From a5646621e3ca36b44a4f5623fe7aecda2ea21d47 Mon Sep 17 00:00:00 2001 From: Sergey B Kirpichev Date: Thu, 30 May 2024 23:20:17 +0300 Subject: [PATCH 001/272] CLN: use isnan() instead of the Py_IS_NAN macro (#58850) --- .../include/pandas/vendored/klib/khash_python.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 811fdd139de2c..8d4c382241d39 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -156,7 +156,7 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) // NaN-floats should be in the same equivalency class, see GH 22119 static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { - return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) || + return (isnan(PyFloat_AS_DOUBLE(a)) && isnan(PyFloat_AS_DOUBLE(b))) || (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b)); } @@ -164,12 +164,12 @@ static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { // PyObject_RichCompareBool for complexobjects has a different behavior // needs to be replaced static inline int complexobject_cmp(PyComplexObject *a, PyComplexObject *b) { - return (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && - Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) || - (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + return (isnan(a->cval.real) && isnan(b->cval.real) && isnan(a->cval.imag) && + isnan(b->cval.imag)) || + (isnan(a->cval.real) && isnan(b->cval.real) && a->cval.imag == b->cval.imag) || - (a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag)) || + (a->cval.real == b->cval.real && isnan(a->cval.imag) && + isnan(b->cval.imag)) || (a->cval.real == b->cval.real && a->cval.imag == b->cval.imag); } @@ -223,7 +223,7 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { static inline Py_hash_t _Pandas_HashDouble(double val) { // Since Python3.10, nan is no longer has hash 0 - if (Py_IS_NAN(val)) { + if (isnan(val)) { return 0; } #if PY_VERSION_HEX < 0x030A0000 From 5e972376fd5e4ab033f9922b546495d8efc9fda5 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Fri, 31 May 2024 13:18:47 -0400 Subject: [PATCH 002/272] DOC: Add note about deprecated offset aliases (#58861) DOC: Add note about alias deprecations The PRs #55792 (Y), #52064 (Q), and #55553 (M) deprecated the single letter version of the aliases in favour of the -end version of them. Add a note to the offset table about deprecations. --- doc/source/user_guide/timeseries.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index b31249e1cf7c1..ab3f5b314ed83 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1273,6 +1273,10 @@ frequencies. We will refer to these aliases as *offset aliases*. are deprecated in favour of the aliases ``h``, ``bh``, ``cbh``, ``min``, ``s``, ``ms``, ``us``, and ``ns``. + Aliases ``Y``, ``M``, and ``Q`` are deprecated in favour of the aliases + ``YE``, ``ME``, ``QE``. + + .. 
note:: When using the offset aliases above, it should be noted that functions From 0347e435e12ab4736cc6cc872dbdd560b453590c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:40:14 +0530 Subject: [PATCH 003/272] DOC: fix PR01 for pandas.MultiIndex (#58833) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a93e23a0f5022..95ac6d457431e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -76,7 +76,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02,SA01" \ -i "pandas.Grouper PR02" \ - -i "pandas.MultiIndex PR01" \ -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3927619a567bf..1868081b0b0dc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -209,8 +209,12 @@ class MultiIndex(Index): level). names : optional sequence of objects Names for each of the index levels. (name is accepted for compat). + dtype : Numpy dtype or pandas type, optional + Data type for the MultiIndex. copy : bool, default False Copy the meta-data. + name : Label + Kept for compatibility with 1-dimensional Index. Should not be used. verify_integrity : bool, default True Check that the levels/codes are consistent and valid. From 36e2fb78bb69ff4c4156f93ce096746576e6484b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:41:01 +0530 Subject: [PATCH 004/272] DOC: fix SA01 for pandas.MultiIndex.dtypes (#58834) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 95ac6d457431e..fde287a8e060c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -79,7 +79,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ - -i "pandas.MultiIndex.dtypes SA01" \ -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1868081b0b0dc..2568cd9bd18b5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -775,6 +775,11 @@ def dtypes(self) -> Series: """ Return the dtypes as a Series for the underlying MultiIndex. + See Also + -------- + Index.dtype : Return the dtype object of the underlying data. + Series.dtypes : Return the data type of the underlying Series. 
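For readers following along, a minimal doctest-style sketch of the attribute these new ``See Also`` entries point at (the toy index is illustrative and not taken from the patch; exact repr alignment may differ):

>>> import pandas as pd
>>> mi = pd.MultiIndex.from_arrays([[1, 2], ["a", "b"]], names=["num", "let"])
>>> mi.dtypes  # one dtype per level, keyed by level name
num     int64
let    object
dtype: object
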
+ Examples -------- >>> idx = pd.MultiIndex.from_product( From 5df0581da65865877f1d91a6b6eb7eed4f2bc071 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:42:11 +0530 Subject: [PATCH 005/272] DOC: fix SA01 for pandas.MultiIndex.levels (#58835) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fde287a8e060c..8f4efdffc090c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ - -i "pandas.MultiIndex.levels SA01" \ -i "pandas.MultiIndex.levshape SA01" \ -i "pandas.MultiIndex.names SA01" \ -i "pandas.MultiIndex.nlevels SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2568cd9bd18b5..69d4627f96044 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -835,6 +835,12 @@ def levels(self) -> FrozenList: it filters out all rows of the level C, MultiIndex.levels will still return A, B, C. + See Also + -------- + MultiIndex.codes : The codes of the levels in the MultiIndex. + MultiIndex.get_level_values : Return vector of label values for requested + level. + Examples -------- >>> index = pd.MultiIndex.from_product( From 35219f159655e4aa5ab66a183340dca7a343ccf1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:43:15 +0530 Subject: [PATCH 006/272] DOC: fix SA01 for pandas.MultiIndex.set_codes (#58836) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8f4efdffc090c..b2fc55d5fe72c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -87,7 +87,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.nlevels SA01" \ -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ - -i "pandas.MultiIndex.set_codes SA01" \ -i "pandas.MultiIndex.set_levels RT03,SA01" \ -i "pandas.MultiIndex.sortlevel PR07,SA01" \ -i "pandas.MultiIndex.to_frame RT03" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 69d4627f96044..43f63a733fbaf 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1149,6 +1149,12 @@ def set_codes( new index (of same type and class...etc) or None The same type as the caller or None if ``inplace=True``. + See Also + -------- + MultiIndex.set_levels : Set new levels on MultiIndex. + MultiIndex.codes : Get the codes of the levels in the MultiIndex. + MultiIndex.levels : Get the levels of the MultiIndex. 
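The ``levels``/``codes`` pairing that these new cross-references describe can be sketched as follows (values chosen purely for illustration):

>>> import pandas as pd
>>> idx = pd.MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 1]])
>>> idx.levels  # the distinct values per level
FrozenList([['a', 'b'], [1, 2]])
>>> idx.codes  # integer positions into those levels
FrozenList([[0, 0, 1], [0, 1, 0]])
>>> idx.set_codes([1, 0, 0], level=0)  # remap level 0 without touching level 1
MultiIndex([('b', 1),
            ('a', 2),
            ('a', 1)],
           )
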
+ Examples -------- >>> idx = pd.MultiIndex.from_tuples( From 07e8e54813e15bbe0466d68b810f14cbe3a2e05c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:43:54 +0530 Subject: [PATCH 007/272] DOC: fix SA01 for pandas.MultiIndex.truncate (#58837) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b2fc55d5fe72c..6614611e52224 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -90,7 +90,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.set_levels RT03,SA01" \ -i "pandas.MultiIndex.sortlevel PR07,SA01" \ -i "pandas.MultiIndex.to_frame RT03" \ - -i "pandas.MultiIndex.truncate SA01" \ -i "pandas.NA SA01" \ -i "pandas.NaT SA01" \ -i "pandas.NamedAgg SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 43f63a733fbaf..8b11f8087db94 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3611,6 +3611,11 @@ def truncate(self, before=None, after=None) -> MultiIndex: MultiIndex The truncated MultiIndex. + See Also + -------- + DataFrame.truncate : Truncate a DataFrame before and after some index values. + Series.truncate : Truncate a Series before and after some index values. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a", "b", "c"], ["x", "y", "z"]]) From fa937aa2050b1084b47509de9a2e7e4cad1e0f9c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:44:39 +0530 Subject: [PATCH 008/272] DOC: fix SA01 for pandas.Period.is_leap_year (#58839) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6614611e52224..594da4f2bd1e3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ - -i "pandas.Period.is_leap_year SA01" \ -i "pandas.Period.month SA01" \ -i "pandas.Period.now SA01" \ -i "pandas.Period.ordinal GL08" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 838b5b9f4595f..4ca10850ab839 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2443,6 +2443,12 @@ cdef class _Period(PeriodMixin): """ Return True if the period's year is in a leap year. + See Also + -------- + Timestamp.is_leap_year : Check if the year in a Timestamp is a leap year. + DatetimeIndex.is_leap_year : Boolean indicator if the date belongs to a + leap year. 
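A quick sketch of the property alongside its new ``See Also`` targets (dates are illustrative):

>>> import pandas as pd
>>> pd.Period("2024-02", "M").is_leap_year  # 2024 is a leap year
True
>>> pd.Period("2023-02", "M").is_leap_year
False
>>> pd.Timestamp("2024-02-29").is_leap_year  # the Timestamp counterpart
True
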
+ Examples -------- >>> period = pd.Period('2022-01', 'M') From 8dc5a3f94d49b9ffbf5c4ed5ac8ba9942c876e17 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:45:14 +0530 Subject: [PATCH 009/272] DOC: fix SA01 for pandas.Period (#58838) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 594da4f2bd1e3..d4bebcf18e800 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -93,7 +93,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.NA SA01" \ -i "pandas.NaT SA01" \ -i "pandas.NamedAgg SA01" \ - -i "pandas.Period SA01" \ -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 4ca10850ab839..ddde4c68820f6 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2699,6 +2699,12 @@ class Period(_Period): second : int, default 0 Second value of the period. + See Also + -------- + Timestamp : Pandas replacement for python datetime.datetime object. + date_range : Return a fixed frequency DatetimeIndex. + timedelta_range : Generates a fixed frequency range of timedeltas. + Examples -------- >>> period = pd.Period('2012-1-1', freq='D') From 79f4c5353dc10799355b82f97aeab99f6e5d288f Mon Sep 17 00:00:00 2001 From: Anish Karthik <89824626+anishfish2@users.noreply.github.com> Date: Fri, 31 May 2024 13:16:16 -0500 Subject: [PATCH 010/272] DOC: Fix docstring error SA01 for pandas.Dataframe.plot, pandas.Series.plot, pandas.core.groupby.DataFrameGroupBy.plot, pandas.core.groupby.SeriesGroupBy.plot (#58842) * Updated See Also for pandas.Dataframe.plot - S01 * Updating pandas.Series.plot to pass SA01 docstring test * Updating pandas.core.groupby.DataFrameGroupBy.plot to pass SA01 docstring test * Updating pandas.core.groupby.SeriesGroupBy.plot to pass SA01 docstring test --- ci/code_checks.sh | 8 ++++---- pandas/plotting/_core.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d4bebcf18e800..dee40075bcd74 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -74,7 +74,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.mean RT03,SA01" \ -i "pandas.DataFrame.median RT03,SA01" \ -i "pandas.DataFrame.min RT03" \ - -i "pandas.DataFrame.plot PR02,SA01" \ + -i "pandas.DataFrame.plot PR02" \ -i "pandas.Grouper PR02" \ -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ @@ -165,7 +165,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.lt SA01" \ -i "pandas.Series.ne SA01" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.plot PR02,SA01" \ + -i "pandas.Series.plot PR02" \ -i "pandas.Series.pop RT03,SA01" \ -i "pandas.Series.prod RT03" \ -i "pandas.Series.product RT03" \ @@ -360,7 +360,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.plot PR02,SA01" \ + -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.prod SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ @@ -378,7 +378,7 @@ if [[ -z "$CHECK" || "$CHECK" == 
"docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.min SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.plot PR02,SA01" \ + -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.prod SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \ diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index ea5daf02b7252..c83985917591c 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -791,6 +791,21 @@ class PlotAccessor(PandasObject): If the backend is not the default matplotlib one, the return value will be the object returned by the backend. + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + DataFrame.hist : Make a histogram. + DataFrame.boxplot : Make a box plot. + DataFrame.plot.scatter : Make a scatter plot with varying marker + point size and color. + DataFrame.plot.hexbin : Make a hexagonal binning plot of + two variables. + DataFrame.plot.kde : Make Kernel Density Estimate plot using + Gaussian kernels. + DataFrame.plot.area : Make a stacked area plot. + DataFrame.plot.bar : Make a bar plot. + DataFrame.plot.barh : Make a horizontal bar plot. + Notes ----- - See matplotlib documentation online for more on this subject From 454b0af783c15cf28b930f8e1f9c67f0e9c0ba10 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 31 May 2024 23:46:52 +0530 Subject: [PATCH 011/272] DOC: fix SA01 for pandas.Period.quarter (#58841) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index dee40075bcd74..b9b3ca24b4162 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -99,7 +99,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.month SA01" \ -i "pandas.Period.now SA01" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.Period.quarter SA01" \ -i "pandas.Period.strftime PR01,SA01" \ -i "pandas.Period.to_timestamp SA01" \ -i "pandas.Period.year SA01" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index ddde4c68820f6..023a0f52e320f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2329,6 +2329,12 @@ cdef class _Period(PeriodMixin): """ Return the quarter this Period falls on. + See Also + -------- + Timestamp.quarter : Return the quarter of the Timestamp. + Period.year : Return the year of the period. + Period.month : Return the month of the period. + Examples -------- >>> period = pd.Period('2022-04', 'M') From 8593554b128392f5ad2422f574e990dba451fea9 Mon Sep 17 00:00:00 2001 From: James Yuill <130482796+jamesyuill@users.noreply.github.com> Date: Fri, 31 May 2024 19:20:32 +0100 Subject: [PATCH 012/272] corrects-spelling-mistake-in-merging.rst (#58874) Changed 'mactches' to 'matches' --- doc/source/user_guide/merging.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index f3b849dc6de45..cfd2f40aa93a3 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -974,7 +974,7 @@ with optional filling of missing data with ``fill_method``. 
:func:`merge_asof` --------------------- -:func:`merge_asof` is similar to an ordered left-join except that mactches are on the +:func:`merge_asof` is similar to an ordered left-join except that matches are on the nearest key rather than equal keys. For each row in the ``left`` :class:`DataFrame`, the last row in the ``right`` :class:`DataFrame` are selected where the ``on`` key is less than the left's key. Both :class:`DataFrame` must be sorted by the key. From 528d176227d6be4e97bd6d0454d1428473540adb Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 31 May 2024 20:21:43 +0200 Subject: [PATCH 013/272] DOC: Fix reference to api.typing.NaType (#58871) --- doc/source/user_guide/missing_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 5149bd30dbbef..29f3fea899336 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -32,7 +32,7 @@ use :class:`api.typing.NaTType`. :class:`NA` for :class:`StringDtype`, :class:`Int64Dtype` (and other bit widths), :class:`Float64Dtype` (and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`. These types will maintain the original data type of the data. -For typing applications, use :class:`api.types.NAType`. +For typing applications, use :class:`api.typing.NAType`. .. ipython:: python From f95c3a0d5dee3bce14513fa9a343f6000648efb7 Mon Sep 17 00:00:00 2001 From: santhoshbethi <45058712+santhoshbethi@users.noreply.github.com> Date: Fri, 31 May 2024 14:33:18 -0400 Subject: [PATCH 014/272] upadting doc string (#58830) * upadting doc string * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Empty-Commit * updating doc string :msg --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/tests/arithmetic/common.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index d7a8b0510b50f..0730729e2fd94 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -20,13 +20,16 @@ def assert_cannot_add(left, right, msg="cannot add"): """ - Helper to assert that left and right cannot be added. + Helper function to assert that two objects cannot be added. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str, default "cannot add" + The error message expected in the TypeError. """ with pytest.raises(TypeError, match=msg): left + right @@ -36,13 +39,17 @@ def assert_cannot_add(left, right, msg="cannot add"): def assert_invalid_addsub_type(left, right, msg=None): """ - Helper to assert that left and right can be neither added nor subtracted. + Helper function to assert that two objects can + neither be added nor subtracted. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str or None, default None + The error message expected in the TypeError. """ with pytest.raises(TypeError, match=msg): left + right From a2a78d33f5f9d730225717233624dc8f41bd4e2f Mon Sep 17 00:00:00 2001 From: mutricyl <118692416+mutricyl@users.noreply.github.com> Date: Fri, 31 May 2024 20:38:47 +0200 Subject: [PATCH 015/272] updating df.query and df.eval docstrings. resolves #16283 (#58749) * updating df.query and df.eval docstrings. 
resolves #16283 * typo * adding 1 example * changing wording following example added * updating 'C C' to 'C&C' for eval * updating 'C C' to 'C&C' for query --------- Co-authored-by: Laurent Mutricy --- pandas/core/frame.py | 109 +++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 36 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01ac5a2be3d79..912b4353acacf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4577,36 +4577,44 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No Examples -------- >>> df = pd.DataFrame( - ... {"A": range(1, 6), "B": range(10, 0, -2), "C C": range(10, 5, -1)} + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} ... ) >>> df - A B C C + A B C&C 0 1 10 10 1 2 8 9 2 3 6 8 3 4 4 7 4 5 2 6 >>> df.query("A > B") - A B C C + A B C&C 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] - A B C C + A B C&C 4 5 2 6 For columns with spaces in their name, you can use backtick quoting. - >>> df.query("B == `C C`") - A B C C + >>> df.query("B == `C&C`") + A B C&C 0 1 10 10 The previous expression is equivalent to - >>> df[df.B == df["C C"]] - A B C C + >>> df[df.B == df["C&C"]] + A B C&C 0 1 10 10 + + Using local variable: + + >>> local_var = 2 + >>> df.query("A <= @local_var") + A B C&C + 0 1 10 10 + 1 2 8 9 """ inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): @@ -4647,6 +4655,13 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: ---------- expr : str The expression string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable + names by surrounding them with backticks `````. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4678,14 +4693,16 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Examples -------- - >>> df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)}) + >>> df = pd.DataFrame( + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} + ... ) >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 >>> df.eval("A + B") 0 11 1 10 @@ -4697,35 +4714,55 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Assignment is allowed though by default the original DataFrame is not modified. - >>> df.eval("C = A + B") - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 + >>> df.eval("D = A + B") + A B C&C D + 0 1 10 10 11 + 1 2 8 9 10 + 2 3 6 8 9 + 3 4 4 7 8 + 4 5 2 6 7 >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' - ... C = A + B - ... D = A - B + ... D = A + B + ... E = A - B ... ''' ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 + A B C&C D E + 0 1 10 10 11 -9 + 1 2 8 9 10 -6 + 2 3 6 8 9 -3 + 3 4 4 7 8 0 + 4 5 2 6 7 3 + + For columns with spaces in their name, you can use backtick quoting. 
+ + >>> df.eval("B * `C&C`") + 0 100 + 1 72 + 2 48 + 3 28 + 4 12 + + Local variables shall be explicitly referenced using ``@`` + character in front of the name: + + >>> local_var = 2 + >>> df.eval("@local_var * A") + 0 2 + 1 4 + 2 6 + 3 8 + 4 10 """ from pandas.core.computation.eval import eval as _eval From 2ea036f0491417786a662c320e504d7eb9aba83c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 31 May 2024 11:40:04 -0700 Subject: [PATCH 016/272] ENH/WIP: resolution inference in pd.to_datetime, DatetimeIndex (#55901) * ENH: read_stata return non-nano * GH ref * move whatsnew * remove outdated whatsnew * ENH: read_stata return non-nano * avoid Series.view * dont go through Series * TST: dt64 units * BUG: cut with non-nano * BUG: round with non-nanosecond raising OverflowError * woops * BUG: cut with non-nano * TST: parametrize tests over dt64 unit * xfail non-nano * revert * BUG: mixed-type mixed-timezone/awareness * commit so i can unstash something else i hope * ENH: infer resolution in to_datetime, DatetimeIndex * revert commented-out * revert commented-out * revert commented-out * remove commented-out * remove comment * revert unnecessary * revert unnecessary * fix window tests * Fix resample tests * restore comment * revert unnecessary * remove no-longer necessary * revert no-longer-necessary * revert no-longer-necessary * update tests * revert no-longer-necessary * update tests * revert bits * update tests * cleanup * revert * revert * parametrize over unit * update tests * update tests * revert no-longer-needed * revert no-longer-necessary * revert no-longer-necessary * revert no-longer-necessary * revert no-longer-necessary * Revert no-longer-necessary * update test * update test * simplify * update tests * update tests * update tests * revert no-longer-necessary * post-merge fixup * revert no-longer-necessary * update tests * update test * update tests * update tests * remove commented-out * revert no-longer-necessary * as_unit->astype * cleanup * merge fixup * revert bit * revert no-longer-necessary, xfail * update multithread test * update tests * update doctest * update tests * update doctests * update tests * update db tests * troubleshoot db tests * update test * troubleshoot sql tests * update test * update tests * mypy fixup * Update test * kludge test * update test * update for min-version tests * fix adbc check * troubleshoot minimum version deps * troubleshoot * troubleshoot * troubleshoot * whatsnew * update abdc-driver-postgresql minimum version * update doctest * fix doc example * troubleshoot test_api_custom_dateparsing_error * troubleshoot * troubleshoot * troubleshoot * troubleshoot * troubleshoot * troubleshoot * update exp instead of object cast * revert accidental * simplify test --- doc/source/whatsnew/v3.0.0.rst | 63 ++++++ pandas/_libs/lib.pyx | 10 +- pandas/_libs/tslib.pyx | 17 +- pandas/_libs/tslibs/strptime.pyx | 6 +- pandas/core/algorithms.py | 8 +- pandas/core/arrays/datetimelike.py | 12 +- pandas/core/arrays/datetimes.py | 35 ++-- pandas/core/base.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/dtypes/missing.py | 4 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 10 +- pandas/core/groupby/generic.py | 4 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimes.py | 5 +- pandas/core/series.py | 5 +- pandas/core/tools/datetimes.py | 16 +- pandas/tests/arithmetic/test_period.py | 7 +- .../tests/arrays/categorical/test_missing.py | 2 +- pandas/tests/arrays/test_array.py | 10 +- 
pandas/tests/base/test_constructors.py | 6 +- pandas/tests/base/test_conversion.py | 4 +- pandas/tests/dtypes/test_inference.py | 6 +- pandas/tests/extension/test_arrow.py | 2 +- .../frame/constructors/test_from_records.py | 2 +- pandas/tests/frame/indexing/test_setitem.py | 3 +- .../tests/frame/methods/test_combine_first.py | 25 ++- .../tests/frame/methods/test_infer_objects.py | 2 +- pandas/tests/frame/methods/test_map.py | 7 +- pandas/tests/frame/methods/test_to_csv.py | 41 +++- pandas/tests/frame/test_constructors.py | 61 ++++-- pandas/tests/groupby/test_apply.py | 4 +- pandas/tests/groupby/test_groupby.py | 2 +- .../indexes/datetimes/test_constructors.py | 62 +++--- .../indexes/datetimes/test_date_range.py | 4 +- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexes/test_index_new.py | 11 +- pandas/tests/indexing/test_coercion.py | 20 +- pandas/tests/indexing/test_loc.py | 7 +- pandas/tests/indexing/test_partial.py | 2 +- pandas/tests/interchange/test_impl.py | 3 +- pandas/tests/io/excel/test_readers.py | 9 +- pandas/tests/io/excel/test_writers.py | 16 +- pandas/tests/io/json/test_pandas.py | 17 +- .../io/parser/common/test_common_basic.py | 9 +- pandas/tests/io/parser/common/test_index.py | 3 +- pandas/tests/io/parser/test_multi_thread.py | 9 +- pandas/tests/io/parser/test_parse_dates.py | 58 ++++-- pandas/tests/io/parser/test_read_fwf.py | 4 +- pandas/tests/io/parser/test_skiprows.py | 8 +- .../io/parser/usecols/test_parse_dates.py | 2 +- pandas/tests/io/pytables/test_store.py | 12 +- pandas/tests/io/test_fsspec.py | 16 +- pandas/tests/io/test_gcs.py | 6 +- pandas/tests/io/test_html.py | 14 +- pandas/tests/io/test_orc.py | 2 + pandas/tests/io/test_parquet.py | 44 +++- pandas/tests/io/test_sql.py | 52 +++-- pandas/tests/io/test_stata.py | 23 +- pandas/tests/resample/test_base.py | 3 +- pandas/tests/resample/test_time_grouper.py | 12 +- .../reshape/concat/test_append_common.py | 4 +- pandas/tests/reshape/concat/test_datetimes.py | 4 +- pandas/tests/reshape/test_cut.py | 52 +++-- pandas/tests/reshape/test_qcut.py | 4 +- pandas/tests/scalar/test_nat.py | 6 +- .../series/methods/test_combine_first.py | 2 +- pandas/tests/series/methods/test_fillna.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 5 +- pandas/tests/series/test_constructors.py | 34 ++- pandas/tests/test_algos.py | 1 + pandas/tests/tools/test_to_datetime.py | 197 +++++++++++------- pandas/tests/tools/test_to_timedelta.py | 2 +- pandas/tests/tseries/holiday/test_calendar.py | 4 +- pandas/tests/tslibs/test_array_to_datetime.py | 50 +++-- pandas/tests/util/test_hashing.py | 16 +- 77 files changed, 745 insertions(+), 457 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6a6abcf2d48fe..865996bdf8892 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -124,6 +124,69 @@ notable_bug_fix2 Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_300.api_breaking.datetime_resolution_inference: + +Datetime resolution inference +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Converting a sequence of strings, ``datetime`` objects, or ``np.datetime64`` objects to +a ``datetime64`` dtype now performs inference on the appropriate resolution (AKA unit) for the output dtype. This affects :class:`Series`, :class:`DataFrame`, :class:`Index`, :class:`DatetimeIndex`, and :func:`to_datetime`. + +Previously, these would always give nanosecond resolution: + +.. 
code-block:: ipython + + In [1]: dt = pd.Timestamp("2024-03-22 11:36").to_pydatetime() + In [2]: pd.to_datetime([dt]).dtype + Out[2]: dtype('>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) - array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + array(['2016-01-01T00:00:00'], dtype='datetime64[s]') >>> pd.unique( ... pd.Series( ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) @@ -365,7 +366,8 @@ def unique(values): ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) DatetimeIndex(['2016-01-01 00:00:00-05:00'], diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 925858a20ce41..673001337767b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1849,11 +1849,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _floor_example = """>>> rng.floor('h') @@ -1876,11 +1876,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _ceil_example = """>>> rng.ceil('h') @@ -1903,11 +1903,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b075e3d299ed0..bbbf7a9b4a63a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -218,7 +218,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ... ) ['2023-01-01 00:00:00', '2023-01-02 00:00:00'] - Length: 2, dtype: datetime64[ns] + Length: 2, dtype: datetime64[s] """ _typ = "datetimearray" @@ -613,7 +613,7 @@ def tz(self) -> tzinfo | None: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.tz datetime.timezone.utc @@ -1047,7 +1047,7 @@ def tz_localize( 4 2018-10-28 02:30:00+01:00 5 2018-10-28 03:00:00+01:00 6 2018-10-28 03:30:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] In some cases, inferring the DST is impossible. 
In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly @@ -1059,14 +1059,14 @@ def tz_localize( 0 2018-10-28 01:20:00+02:00 1 2018-10-28 02:36:00+02:00 2 2018-10-28 03:46:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) + ... '2015-03-29 03:30:00'], dtype="M8[ns]")) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 0 2015-03-29 03:00:00+02:00 1 2015-03-29 03:30:00+02:00 @@ -1427,7 +1427,7 @@ def time(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.time 0 10:00:00 1 11:00:00 @@ -1470,7 +1470,7 @@ def timetz(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.timetz 0 10:00:00+00:00 1 11:00:00+00:00 @@ -1512,7 +1512,7 @@ def date(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.date 0 2020-01-01 1 2020-02-01 @@ -1861,7 +1861,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.dayofyear 0 1 1 32 @@ -1897,7 +1897,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-04-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.quarter 0 1 1 2 @@ -1933,7 +1933,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.daysinmonth 0 31 1 29 @@ -2372,9 +2372,9 @@ def _sequence_to_dt64( data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - if out_unit is None: - out_unit = "ns" - out_dtype = np.dtype(f"M8[{out_unit}]") + out_dtype = DT64NS_DTYPE + if out_unit is not None: + out_dtype = np.dtype(f"M8[{out_unit}]") if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, @@ -2400,7 +2400,7 @@ def _sequence_to_dt64( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, - out_unit=out_unit or "ns", + out_unit=out_unit, ) copy = False if tz and inferred_tz: @@ -2508,7 +2508,7 @@ def objects_to_datetime64( utc: bool = False, errors: DateTimeErrorChoices = "raise", allow_object: bool = False, - out_unit: str = "ns", + out_unit: str | None = None, ) -> tuple[np.ndarray, tzinfo | None]: """ Convert data to array of timestamps. @@ -2524,7 +2524,8 @@ def objects_to_datetime64( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. - out_unit : str, default "ns" + out_unit : str or None, default None + None indicates we should do resolution inference. 
Returns ------- diff --git a/pandas/core/base.py b/pandas/core/base.py index 5cdbde8c64c47..b784dc8b03292 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1334,7 +1334,7 @@ def factorize( 0 2000-03-11 1 2000-03-12 2 2000-03-13 - dtype: datetime64[ns] + dtype: datetime64[s] >>> ser.searchsorted('3/14/2000') 3 diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 08adb580ff08f..662b8c5791e51 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1193,7 +1193,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, - dtype_if_all_nat=np.dtype("M8[ns]"), + dtype_if_all_nat=np.dtype("M8[s]"), ) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 45814ca77b70f..5213be8b69016 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -205,7 +205,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): by providing an empty index. As follows, >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype - dtype(' bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.isna(index) array([False, False, True, False]) @@ -362,7 +362,7 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.notna(index) array([ True, True, False, True]) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 912b4353acacf..97a4e414608b8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13286,7 +13286,7 @@ def to_period( >>> idx DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.to_period("M") PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ca60ca9b48a14..22eecdc95934f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3209,7 +3209,7 @@ class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' Dimensions: (date: 2, animal: 2) Coordinates: - * date (date) datetime64[ns] 2018-01-01 2018-01-02 + * date (date) datetime64[s] 2018-01-01 2018-01-02 * animal (animal) object 'falcon' 'parrot' Data variables: speed (date, animal) int64 350 18 361 15 @@ -6194,7 +6194,7 @@ def dtypes(self): >>> df.dtypes float float64 int int64 - datetime datetime64[ns] + datetime datetime64[s] string object dtype: object """ @@ -10653,10 +10653,10 @@ def tz_localize( dates forward or backward with a timedelta object or `'shift_forward'` or `'shift_backward'`. - >>> s = pd.Series( - ... range(2), - ... index=pd.DatetimeIndex(["2015-03-29 02:30:00", "2015-03-29 03:30:00"]), + >>> dti = pd.DatetimeIndex( + ... ["2015-03-29 02:30:00", "2015-03-29 03:30:00"], dtype="M8[ns]" ... 
) + >>> s = pd.Series(range(2), index=dti) >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_forward") 2015-03-29 03:00:00+02:00 0 2015-03-29 03:30:00+02:00 1 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a20577e8d3df9..0c4f22f736d4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1206,7 +1206,7 @@ def idxmin(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmin() a 2023-01-01 b 2023-02-01 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmin", skipna=skipna) @@ -1259,7 +1259,7 @@ def idxmax(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmax() a 2023-01-15 b 2023-02-15 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmax", skipna=skipna) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a3fb8bc851df..56030a15dc143 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2634,7 +2634,7 @@ def isna(self) -> npt.NDArray[np.bool_]: ... ) >>> idx DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.isna() array([False, True, True, True]) """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 78f04f57029b1..930bc7a95bd14 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -242,7 +242,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> idx DatetimeIndex(['2020-01-01 10:00:00+00:00', '2020-02-01 11:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) """ _typ = "datetimeindex" @@ -473,7 +473,8 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: Examples -------- >>> idx = pd.DatetimeIndex( - ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"] + ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"], + ... dtype="M8[ns]", ... ) >>> idx DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], diff --git a/pandas/core/series.py b/pandas/core/series.py index c49eef49f7393..f67c0753fa9df 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2060,14 +2060,14 @@ def unique(self) -> ArrayLike: >>> pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique() ['2016-01-01 00:00:00'] - Length: 1, dtype: datetime64[ns] + Length: 1, dtype: datetime64[s] >>> pd.Series( ... [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)] ... ).unique() ['2016-01-01 00:00:00-05:00'] - Length: 1, dtype: datetime64[ns, US/Eastern] + Length: 1, dtype: datetime64[s, US/Eastern] An Categorical will return categories in the order of appearance and with the same dtype. @@ -3175,6 +3175,7 @@ def combine_first(self, other) -> Series: other = other.reindex(keep_other) if this.dtype.kind == "M" and other.dtype.kind != "M": + # TODO: try to match resos? 
other = to_datetime(other) combined = concat([this, other]) combined = combined.reindex(new_index) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b01cdb335ec46..c116ef015ae16 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -29,6 +29,7 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -524,6 +525,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: utc=utc, errors=errors, unit_for_numerics=unit, + creso=NpyDatetimeUnit.NPY_FR_ns.value, ) result = DatetimeIndex(arr, name=name) @@ -873,7 +875,7 @@ def to_datetime( >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 - dtype: datetime64[ns] + dtype: datetime64[s] Using a unix epoch time @@ -903,7 +905,7 @@ def to_datetime( Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. - >>> pd.to_datetime("13000101", format="%Y%m%d", errors="coerce") + >>> pd.to_datetime("invalid for Ymd", format="%Y%m%d", errors="coerce") NaT .. _to_datetime_tz_examples: @@ -916,14 +918,14 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00:00", "2018-10-26 13:00:15"]) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) - Timezone-aware inputs *with constant time offset* are converted to timezone-aware :class:`DatetimeIndex`: >>> pd.to_datetime(["2018-10-26 12:00 -0500", "2018-10-26 13:00 -0500"]) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], - dtype='datetime64[ns, UTC-05:00]', freq=None) + dtype='datetime64[s, UTC-05:00]', freq=None) - However, timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris) @@ -965,21 +967,21 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00"], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Timezone-aware inputs are *converted* to UTC (the output represents the exact same datetime, but viewed from the UTC time offset `+00:00`). 
>>> pd.to_datetime(["2018-10-26 12:00 -0530", "2018-10-26 12:00 -0500"], utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Inputs can contain both string or datetime, the above rules still apply >>> pd.to_datetime(["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[us, UTC]', freq=None) """ if exact is not lib.no_default and format in {"mixed", "ISO8601"}: raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 18f1993c198df..539df9d61a7b2 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1361,7 +1361,12 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - msg = "cannot add PeriodArray and DatetimeArray" + if box_with_array is pd.DataFrame: + # TODO: before implementing resolution-inference we got the same + # message with DataFrame and non-DataFrame. Why did that change? + msg = "cannot add PeriodArray and Timestamp" + else: + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 9d4b78ce9944e..e3cb9664e19f2 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -121,7 +121,7 @@ def test_compare_categorical_with_missing(self, a1, a2, categories): @pytest.mark.parametrize( "na_value, dtype", [ - (pd.NaT, "datetime64[ns]"), + (pd.NaT, "datetime64[s]"), (None, "float64"), (np.nan, "float64"), (pd.NA, "float64"), diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 857509e18fa8e..97d57163ed079 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -125,7 +125,7 @@ def test_dt64_array(dtype_unit): ( pd.DatetimeIndex(["2000", "2001"]), None, - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( ["2000", "2001"], @@ -301,11 +301,11 @@ def test_array_copy(): # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[us]"), ), ( np.array([1, 2], dtype="M8[ns]"), @@ -321,7 +321,7 @@ def test_array_copy(): ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="s") ), ), ( @@ -330,7 +330,7 @@ def test_array_copy(): datetime.datetime(2001, 1, 1, tzinfo=cet), ], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="us") ), ), # timedelta diff --git a/pandas/tests/base/test_constructors.py 
b/pandas/tests/base/test_constructors.py index f3ac60f672ee1..c4b02423f8cf0 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -146,10 +146,12 @@ def test_constructor_datetime_outofbound( # No dtype specified (dtype inference) # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data - if a.dtype.kind == "M": + result = constructor(a) + if a.dtype.kind == "M" or isinstance(a[0], np.datetime64): # Can't fit in nanosecond bounds -> get the nearest supported unit - result = constructor(a) assert result.dtype == "M8[s]" + elif isinstance(a[0], datetime): + assert result.dtype == "M8[us]", result.dtype else: result = constructor(a) if using_infer_string and "object-string" in request.node.callspec.id: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 6c0df49b0a93a..dd6bf3c7521f8 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -412,7 +412,7 @@ def test_to_numpy_dtype(as_series): [Timestamp("2000"), Timestamp("2000"), pd.NaT], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) @@ -454,7 +454,7 @@ def test_to_numpy_na_value_numpy_dtype( [(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f4282c9c7ac3a..db18cd4aef14e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -830,7 +830,11 @@ def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): out = lib.maybe_convert_objects(arr, convert_non_numeric=True) # no OutOfBoundsDatetime/OutOfBoundsTimedeltas - tm.assert_numpy_array_equal(out, arr) + if dtype == "datetime64[ns]": + expected = np.array(["2363-10-04"], dtype="M8[us]") + else: + expected = arr + tm.assert_numpy_array_equal(out, expected) def test_maybe_convert_objects_mixed_datetimes(self): ts = Timestamp("now") diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..5926d23b44dd0 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3445,7 +3445,7 @@ def test_arrow_floor_division_large_divisor(dtype): def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] - result = pd.Series(string_dates, dtype="timestamp[ns][pyarrow]") + result = pd.Series(string_dates, dtype="timestamp[s][pyarrow]") expected = pd.Series( ArrowExtensionArray(pa.array(pd.to_datetime(string_dates), from_pandas=True)) ) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 66fc234e79b4d..35e143fcedf7b 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -39,7 +39,7 @@ def test_from_records_with_datetimes(self): expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] - dtypes = [("EXPIRY", " None: ) with tm.assert_produces_warning(FutureWarning, match=msg): result = date_range("2010-01-01", periods=2, freq="m") - expected = DatetimeIndex(["2010-01-31", 
"2010-02-28"], freq="ME") + expected = DatetimeIndex( + ["2010-01-31", "2010-02-28"], dtype="M8[ns]", freq="ME" + ) tm.assert_index_equal(result, expected) def test_date_range_bday(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2e94961b673f8..bd38e6c2ff333 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -186,7 +186,7 @@ def test_constructor_int_dtype_nan(self): "klass,dtype,na_val", [ (Index, np.float64, np.nan), - (DatetimeIndex, "datetime64[ns]", pd.NaT), + (DatetimeIndex, "datetime64[s]", pd.NaT), ], ) def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index b544ebac43ece..4a31ae88a757a 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -61,16 +61,16 @@ def test_infer_nat(self, val): values = [NaT, val] idx = Index(values) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(values[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() @pytest.mark.parametrize("na_value", [None, np.nan]) @pytest.mark.parametrize("vtype", [list, tuple, iter]) @@ -138,6 +138,9 @@ def test_constructor_infer_nat_dt_like( ) expected = klass([NaT, NaT]) + if dtype[0] == "d": + # we infer all-NaT as second resolution + expected = expected.astype("M8[ns]") assert expected.dtype == dtype data = [ctor] data.insert(pos, nulls_fixture) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index d4bc0341e732e..84cd0d3b08b7b 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -598,7 +598,7 @@ def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01"), "datetime64[s]"), (pd.Timestamp("2012-01-01", tz="US/Eastern"), object), (1, object), ("x", object), @@ -615,7 +615,7 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04"), ] ) - assert obj.dtype == "datetime64[ns]" + assert obj.dtype == "datetime64[s]" exp = klass( [ @@ -630,10 +630,10 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[s, US/Eastern]"), (pd.Timestamp("2012-01-01"), object), # pre-2.0 with a mismatched tz we would get object result - (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[s, US/Eastern]"), (1, object), ("x", object), ], @@ -650,7 +650,7 @@ def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04", tz=tz), ] ) - assert obj.dtype == "datetime64[ns, US/Eastern]" + assert obj.dtype == "datetime64[s, US/Eastern]" if 
getattr(fill_val, "tz", None) is None: fv = fill_val @@ -830,6 +830,7 @@ def replacer(self, how, from_key, to_key): def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -850,7 +851,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) @@ -867,7 +867,7 @@ def test_replace_series_datetime_tz( self, how, to_key, from_key, replacer, using_infer_string ): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -891,7 +891,7 @@ def test_replace_series_datetime_tz( ) def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -900,8 +900,8 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) - else: - assert exp.dtype == to_key + elif to_key == from_key: + exp = exp.dt.as_unit("ns") result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 01dab14c7e528..16f3e0fd0c229 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -711,7 +711,7 @@ def test_loc_modify_datetime(self): {"date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]} ) - df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True) + df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True).dt.as_unit("ms") df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"] df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"] @@ -865,6 +865,7 @@ def test_loc_setitem_frame_multiples(self): "val": Series([0, 1, 0, 1, 2], dtype=np.int64), } ) + expected["date"] = expected["date"].astype("M8[ns]") rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs @@ -1814,7 +1815,7 @@ def test_loc_getitem_datetime_string_with_datetimeindex(self): result = df.loc[["2010-01-01", "2010-01-05"], ["a", "b"]] expected = DataFrame( {"a": [0, 4], "b": [0, 4]}, - index=DatetimeIndex(["2010-01-01", "2010-01-05"]), + index=DatetimeIndex(["2010-01-01", "2010-01-05"]).as_unit("ns"), ) tm.assert_frame_equal(result, expected) @@ -2082,7 +2083,7 @@ def test_setitem_with_expansion(self): expected = Series([v[0].tz_convert("UTC"), df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) - v = df.loc[df.new_col == "new", "time"] + Timedelta("1s") + v = df.loc[df.new_col == "new", "time"] + Timedelta("1s").as_unit("s") df.loc[df.new_col == "new", "time"] = v tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index b0a041ed5b69c..4d232d5ed1312 
100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -580,7 +580,7 @@ def test_partial_set_invalid(self): ], ), ( - date_range(start="2000", periods=20, freq="D"), + date_range(start="2000", periods=20, freq="D", unit="s"), ["2000-01-04", "2000-01-08", "2000-01-12"], [ Timestamp("2000-01-04"), diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 60e05c2c65124..64eca6ac643ca 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -603,7 +603,8 @@ def test_empty_dataframe(): ), ( pd.Series( - [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)] + [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)], + dtype="M8[ns]", ), (DtypeKind.DATETIME, 64, "tsn:", "="), (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f0a72ba6163fa..6d6c3ad6b77a7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -141,10 +141,13 @@ def df_ref(datapath): def get_exp_unit(read_ext: str, engine: str | None) -> str: - return "ns" + unit = "us" + if (read_ext == ".ods") ^ (engine == "calamine"): + unit = "s" + return unit -def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None: +def adjust_expected(expected: DataFrame, read_ext: str, engine: str | None) -> None: expected.index.name = None unit = get_exp_unit(read_ext, engine) # error: "Index" has no attribute "as_unit" @@ -1117,7 +1120,6 @@ def test_read_excel_multiindex_blank_after_name( mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) unit = get_exp_unit(read_ext, engine) - expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -1675,6 +1677,7 @@ def test_read_datetime_multiindex(self, request, engine, read_ext): actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) unit = get_exp_unit(read_ext, engine) + dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]") expected_column_index = MultiIndex.from_arrays( [dti[:1], dti[1:]], diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 859152db84b7d..744fe20e4995d 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -37,7 +37,9 @@ def get_exp_unit(path: str) -> str: - return "ns" + if path.endswith(".ods"): + return "s" + return "us" @pytest.fixture @@ -293,12 +295,15 @@ def test_read_excel_parse_dates(self, tmp_excel): tm.assert_frame_equal(df2, res) res = pd.read_excel(tmp_excel, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) + expected = df[:] + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(res, expected) res = pd.read_excel( tmp_excel, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 ) - tm.assert_frame_equal(df, res) + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(expected, res) def test_multiindex_interval_datetimes(self, tmp_excel): # GH 30986 @@ -547,6 +552,7 @@ def test_sheets(self, frame, tmp_excel): columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=5, freq="B"), ) + index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -695,7 +701,6 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # # Excel output format strings unit = 
get_exp_unit(tmp_excel) - df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], @@ -732,6 +737,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): with ExcelFile(filename2) as reader2: rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + # TODO: why do we get different units? + rs2 = rs2.astype(f"M8[{unit}]") + tm.assert_frame_equal(rs1, rs2) # Since the reader returns a datetime object for dates, diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c4065ea01988f..b53957a7e77d1 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -133,7 +133,13 @@ def test_frame_non_unique_index_raises(self, orient): [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], ], ) - def test_frame_non_unique_columns(self, orient, data): + def test_frame_non_unique_columns(self, orient, data, request): + if isinstance(data[0][0], Timestamp) and orient == "split": + mark = pytest.mark.xfail( + reason="GH#55827 non-nanosecond dt64 fails to round-trip" + ) + request.applymarker(mark) + df = DataFrame(data, index=[1, 2], columns=["x", "x"]) expected_warning = None @@ -141,7 +147,7 @@ def test_frame_non_unique_columns(self, orient, data): "The default 'epoch' date format is deprecated and will be removed " "in a future version, please use 'iso' date format instead." ) - if df.iloc[:, 0].dtype == "datetime64[ns]": + if df.iloc[:, 0].dtype == "datetime64[s]": expected_warning = FutureWarning with tm.assert_produces_warning(expected_warning, match=msg): @@ -150,7 +156,7 @@ def test_frame_non_unique_columns(self, orient, data): ) if orient == "values": expected = DataFrame(data) - if expected.iloc[:, 0].dtype == "datetime64[ns]": + if expected.iloc[:, 0].dtype == "datetime64[s]": # orient == "values" by default will write Timestamp objects out # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need @@ -856,6 +862,10 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): data.append("a") ser = Series(data, index=data) + if not as_object: + ser = ser.astype("M8[ns]") + if isinstance(ser.index, DatetimeIndex): + ser.index = ser.index.as_unit("ns") expected_warning = None if date_format == "epoch": @@ -897,6 +907,7 @@ def test_convert_dates_infer(self, infer_word): expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) + expected[infer_word] = expected[infer_word].astype("M8[ns]") result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index df76b46cc6a7b..b665cfba8bdc0 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -40,9 +40,7 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") + expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -64,6 +62,7 @@ def test_read_csv_local(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -144,9 +143,6 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers 
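# --- illustrative aside -------------------------------------------------
# A minimal sketch, separate from the surrounding test, of the parser
# behavior the new "M8[s]" expectations in these hunks pin down (assumes a
# pandas build with this resolution-inference change applied): date-only
# strings carry no sub-second information, so the parsed index now comes
# back at second resolution instead of the old nanosecond default.
from io import StringIO

import pandas as pd

data = "index,A\n2000-01-03,0.98\n2000-01-04,1.04\n"
frame = pd.read_csv(StringIO(data), index_col=0, parse_dates=True)
assert frame.index.dtype == "M8[s]"  # previously "M8[ns]"
# ------------------------------------------------------------------------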
result = parser.read_csv(csv1, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -168,6 +164,7 @@ def test_read_csv_dataframe(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 2fcc80f58ae30..4cfc12cdc46aa 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -260,7 +260,8 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): datetime(2000, 1, 5), datetime(2000, 1, 6), datetime(2000, 1, 7), - ] + ], + dtype="M8[s]", ), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 649a1324686a7..348c19ac0f0c6 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -152,7 +152,8 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): with tm.ensure_clean(file_name) as path: df.to_csv(path) - final_dataframe = _generate_multi_thread_dataframe( - parser, path, num_rows, num_tasks - ) - tm.assert_frame_equal(df, final_dataframe) + result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks) + + expected = df[:] + expected["date"] = expected["date"].astype("M8[s]") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 3bb3d793606e1..e9c6c0f5e32d7 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -62,6 +62,7 @@ def test_date_col_as_index_col(all_parsers): datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 22, 0), ], + dtype="M8[s]", name="X1", ) expected = DataFrame( @@ -90,7 +91,7 @@ def test_nat_parse(all_parsers): df = DataFrame( { "A": np.arange(10, dtype="float64"), - "B": Timestamp("20010101").as_unit("ns"), + "B": Timestamp("20010101"), } ) df.iloc[3:6, :] = np.nan @@ -126,7 +127,7 @@ def test_parse_dates_string(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) + index = date_range("1/1/2009", periods=3, name="date", unit="s")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -143,6 +144,8 @@ def test_parse_dates_column_list(all_parsers, parse_dates): expected = DataFrame( {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} ) + expected["a"] = expected["a"].astype("M8[s]") + expected["c"] = expected["c"].astype("M8[s]") expected = expected.set_index(["a", "b"]) result = parser.read_csv( @@ -166,9 +169,10 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers + dti = date_range("2009-01-01", periods=3, freq="D", unit="s") index = MultiIndex.from_product( [ - (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + dti, ("one", "two", "three"), ], names=["index1", "index2"], @@ -209,9 +213,6 @@ def test_parse_tz_aware(all_parsers): data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) - # TODO: make unit check more 
specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) @@ -302,6 +303,7 @@ def test_parse_dates_empty_string(all_parsers): expected = DataFrame( [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] ) + expected["Date"] = expected["Date"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -312,18 +314,22 @@ def test_parse_dates_empty_string(all_parsers): ( "a\n04.15.2016", {"parse_dates": ["a"]}, - DataFrame([datetime(2016, 4, 15)], columns=["a"]), + DataFrame([datetime(2016, 4, 15)], columns=["a"], dtype="M8[s]"), ), ( "a\n04.15.2016", {"parse_dates": True, "index_col": 0}, - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), + DataFrame( + index=DatetimeIndex(["2016-04-15"], dtype="M8[s]", name="a"), columns=[] + ), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": ["a", "b"]}, DataFrame( - [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], + dtype="M8[s]", + columns=["a", "b"], ), ), ( @@ -331,7 +337,13 @@ def test_parse_dates_empty_string(all_parsers): {"parse_dates": True, "index_col": [0, 1]}, DataFrame( index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + [ + ( + Timestamp(2016, 4, 15).as_unit("s"), + Timestamp(2013, 9, 16).as_unit("s"), + ) + ], + names=["a", "b"], ), columns=[], ), @@ -399,6 +411,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=timezone(timedelta(minutes=540)), + unit="s", )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} @@ -437,7 +450,7 @@ def test_parse_delimited_date_swap_no_warning( all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") if parser.engine == "pyarrow": if not dayfirst: # "CSV parse error: Empty CSV file or block" @@ -470,7 +483,7 @@ def test_parse_delimited_date_swap_with_warning( all_parsers, date_string, dayfirst, expected ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." @@ -555,9 +568,7 @@ def test_date_parser_multiindex_columns(all_parsers): 1,2 2019-12-31,6""" result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) - expected = DataFrame( - {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]} - ) + expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]}) tm.assert_frame_equal(result, expected) @@ -591,6 +602,7 @@ def test_date_parser_usecols_thousands(all_parsers): thousands="-", ) expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) + expected["C"] = expected["C"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -600,7 +612,7 @@ def test_dayfirst_warnings(): # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. 
" @@ -661,7 +673,7 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): # GH47880 initial_value = f"date\n{date_string}" expected = DatetimeIndex( - ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-01-31"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " @@ -716,7 +728,8 @@ def test_replace_nans_before_parsing_dates(all_parsers): pd.NaT, Timestamp("2017-09-09"), ] - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -731,6 +744,7 @@ def test_parse_dates_and_string_dtype(all_parsers): result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) expected["a"] = expected["a"].astype("string") + expected["b"] = expected["b"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -750,7 +764,7 @@ def test_parse_dot_separated_dates(all_parsers): else: expected_index = DatetimeIndex( ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], - dtype="datetime64[ns]", + dtype="datetime64[ms]", name="a", ) warn = UserWarning @@ -783,7 +797,8 @@ def test_parse_dates_dict_format(all_parsers): { "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -816,9 +831,6 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 0a9f6bd83e0d9..45d630c545565 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -298,7 +298,8 @@ def test_fwf_regression(): "2009-06-13 20:40:00", "2009-06-13 20:50:00", "2009-06-13 21:00:00", - ] + ], + dtype="M8[us]", ), columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) @@ -311,6 +312,7 @@ def test_fwf_regression(): parse_dates=True, date_format="%Y%j%H%M%S", ) + expected.index = expected.index.astype("M8[s]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 17a806d05fe28..99642ee4befc6 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -42,7 +42,9 @@ def test_skip_rows_bug(all_parsers, skiprows): StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( @@ -85,7 +87,9 @@ def test_skip_rows_blank(all_parsers): StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 0cf3fe894c916..cc54f2487aa60 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -70,7 +70,7 @@ def 
test_usecols_with_parse_dates3(all_parsers): parse_dates = [0] cols = { - "a": Timestamp("2016-09-21").as_unit("ns"), + "a": Timestamp("2016-09-21"), "b": [1], "c": [1], "d": [2], diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 471f7b8958ee4..3ce30e313cc30 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -613,10 +613,14 @@ def test_store_index_name(setup_path): @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 - idx = DatetimeIndex( - [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], - name="cols\u05d2", - ).tz_localize(tz) + idx = ( + DatetimeIndex( + [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], + name="cols\u05d2", + ) + .tz_localize(tz) + .as_unit(unit) + ) idx1 = ( DatetimeIndex( [dt.date(2010, 1, 1), dt.date(2010, 1, 2)], diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index f6fb032b9d51a..c609ae999d47d 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -72,7 +72,9 @@ def test_read_csv(cleared_fs, df1): w.write(text) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_reasonable_error(monkeypatch, cleared_fs): @@ -95,7 +97,9 @@ def test_to_csv(cleared_fs, df1): df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_to_excel(cleared_fs, df1): @@ -106,7 +110,9 @@ def test_to_excel(cleared_fs, df1): df2 = read_excel(path, parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) @pytest.mark.parametrize("binary_mode", [False, True]) @@ -128,7 +134,9 @@ def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1): ) assert not fsspec_object.closed - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_csv_options(fsspectest): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4b2be41d0c9f9..17b89c9f31616 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -107,7 +107,11 @@ def from_uri(path): df1.to_markdown(path) df2 = df1 - tm.assert_frame_equal(df1, df2) + expected = df1[:] + if format in ["csv", "excel"]: + expected["dt"] = expected["dt"].dt.as_unit("s") + + tm.assert_frame_equal(df2, expected) def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 594c1d02b94cc..dfc9b4156ecab 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1044,11 +1044,15 @@ def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) - expected = df.to_html() - res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) - tm.assert_frame_equal(df, res[0]) - res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) - tm.assert_frame_equal(df, res[0]) + + expected = df[:] 
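# --- illustrative aside -------------------------------------------------
# Sketch of why these round-trip tests now cast expectations to "s"
# (assumes pandas with this change): date_range still produces nanosecond
# data, but serializing to text and re-parsing keeps only whole seconds,
# so the unit coarsens on the way back in.
from io import StringIO

import pandas as pd

df_rt = pd.DataFrame({"date": pd.date_range("1/1/2001", periods=3)})
assert df_rt["date"].dtype == "M8[ns]"
back = pd.read_csv(StringIO(df_rt.to_csv(index=False)), parse_dates=["date"])
assert back["date"].dtype == "M8[s]"
# ------------------------------------------------------------------------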
+ expected["date"] = expected["date"].dt.as_unit("s") + + str_df = df.to_html() + res = flavor_read_html(StringIO(str_df), parse_dates=[1], index_col=0) + tm.assert_frame_equal(expected, res[0]) + res = flavor_read_html(StringIO(str_df), parse_dates=["date"], index_col=0) + tm.assert_frame_equal(expected, res[0]) def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index de6d46492e916..c7d9300c0a638 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -321,6 +321,8 @@ def test_orc_dtype_backend_pyarrow(): ], } ) + # FIXME: without casting to ns we do not round-trip correctly + df["datetime_with_nat"] = df["datetime_with_nat"].astype("M8[ns]") bytes_data = df.copy().to_orc() result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2860b3a6483af..35275f3c23bef 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -670,6 +670,7 @@ def test_read_empty_array(self, pa, dtype): class TestParquetPyArrow(Base): + @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") def test_basic(self, pa, df_full): df = df_full pytest.importorskip("pyarrow", "11.0.0") @@ -706,6 +707,14 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None + if pa_version_under11p0: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ns]" + ) + else: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ms]" + ) tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): @@ -961,7 +970,11 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + + expected = df[:] + if pa_version_under11p0: + expected.index = expected.index.as_unit("ns") + check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -988,13 +1001,14 @@ def test_read_dtype_backend_pyarrow_config(self, pa, df_full): if pa_version_under13p0: # pyarrow infers datetimes as us instead of ns expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]") - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( - "timestamp[us][pyarrow]" - ) expected["datetime_tz"] = expected["datetime_tz"].astype( pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "timestamp[ms][pyarrow]" + ) + check_round_trip( df, engine=pa, @@ -1018,6 +1032,7 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + @pytest.mark.xfail(reason="pa.pandas_compat passes 'datetime64' to .astype") def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -1107,9 +1122,11 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # df.to_parquet(tmp_path / "test.parquet") # result = read_parquet(tmp_path / "test.parquet") # assert result["strings"].dtype == "string" + # FIXME: don't leave commented-out class TestParquetFastParquet(Base): + 
@pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") def test_basic(self, fp, df_full): df = df_full @@ -1254,6 +1271,25 @@ def test_error_on_using_partition_cols_and_partition_on( partition_cols=partition_cols, ) + def test_empty_dataframe(self, fp): + # GH #27339 + df = pd.DataFrame() + expected = df.copy() + check_round_trip(df, fp, expected=expected) + + @pytest.mark.xfail( + reason="fastparquet passed mismatched values/dtype to DatetimeArray " + "constructor, see https://github.com/dask/fastparquet/issues/891" + ) + def test_timezone_aware_index(self, fp, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) + def test_close_file_handle_on_read_error(self): with tm.ensure_clean("test.parquet") as path: pathlib.Path(path).write_bytes(b"breakit") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6058f34d25ad3..df821fb740af8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -19,10 +19,7 @@ import pytest from pandas._libs import lib -from pandas.compat import ( - pa_version_under13p0, - pa_version_under14p1, -) +from pandas.compat import pa_version_under14p1 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -368,7 +365,7 @@ def create_and_load_postgres_datetz(conn): Timestamp("2000-01-01 08:00:00", tz="UTC"), Timestamp("2000-06-01 07:00:00", tz="UTC"), ] - return Series(expected_data, name="DateColWithTz") + return Series(expected_data, name="DateColWithTz").astype("M8[us, UTC]") def check_iris_frame(frame: DataFrame): @@ -1824,7 +1821,7 @@ def test_api_custom_dateparsing_error( pytest.mark.xfail(reason="failing combination of arguments") ) - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + expected = types_data_frame.astype({"DateCol": "datetime64[s]"}) result = read_sql( text, @@ -1847,10 +1844,12 @@ def test_api_custom_dateparsing_error( } ) - if not pa_version_under13p0: - # TODO: is this astype safe? 
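# --- illustrative aside -------------------------------------------------
# Sketch of why the expected unit in the SQL tests below now depends on
# the driver (assumes pandas with this change): pandas keeps the
# resolution implied by what the adapter hands back, rather than forcing
# everything to nanoseconds.
from datetime import datetime

import pandas as pd

assert pd.Timestamp("2000-01-01").unit == "s"  # parsed date-only string
assert pd.Timestamp(datetime(2000, 1, 1)).unit == "us"  # pydatetime objects
# ------------------------------------------------------------------------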
- expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") - + if conn_name == "postgresql_adbc_types" and pa_version_under14p1: + expected["DateCol"] = expected["DateCol"].astype("datetime64[ns]") + elif "postgres" in conn_name or "mysql" in conn_name: + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + else: + expected["DateCol"] = expected["DateCol"].astype("datetime64[s]") tm.assert_frame_equal(result, expected) @@ -2835,7 +2834,9 @@ def test_datetime_with_timezone_table(conn, request): conn = request.getfixturevalue(conn) expected = create_and_load_postgres_datetz(conn) result = sql.read_sql_table("datetz", conn) - tm.assert_frame_equal(result, expected.to_frame()) + + exp_frame = expected.to_frame() + tm.assert_frame_equal(result, exp_frame) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2847,7 +2848,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): # For dbs that support timestamps with timezones, should get back UTC # otherwise naive data should be returned expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific", unit="us")} ) assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 @@ -2865,7 +2866,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): if "sqlite" in conn_name: # read_sql_query does not return datetime type like read_sql_table assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) + result["A"] = to_datetime(result["A"]).dt.as_unit("us") tm.assert_frame_equal(result, expected) @@ -2876,7 +2877,9 @@ def test_out_of_bounds_datetime(conn, request): data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1 result = sql.read_sql_table("test_datetime_obb", conn) - expected = DataFrame([pd.NaT], columns=["date"]) + expected = DataFrame( + np.array([datetime(9999, 1, 1)], dtype="M8[us]"), columns=["date"] + ) tm.assert_frame_equal(result, expected) @@ -2885,7 +2888,7 @@ def test_naive_datetimeindex_roundtrip(conn, request): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC conn = request.getfixturevalue(conn) - dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) + dates = date_range("2018-01-01", periods=5, freq="6h", unit="us")._with_freq(None) expected = DataFrame({"nums": range(5)}, index=dates) assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 result = sql.read_sql_table("foo_table", conn, index_col="info_date") @@ -2937,7 +2940,10 @@ def test_datetime(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) result = result.drop("index", axis=1) - tm.assert_frame_equal(result, df) + + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) @@ -2945,9 +2951,7 @@ def test_datetime(conn, request): if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2962,16 +2966,17 @@ def test_datetime_NaT(conn, request): # with read_table -> type 
information from schema used result = sql.read_sql_table("test_datetime", conn) - tm.assert_frame_equal(result, df) + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"], errors="coerce") - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -3963,6 +3968,7 @@ def test_self_join_date_columns(postgresql_psycopg2_engine): expected = DataFrame( [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 ) + expected["created_dt"] = expected["created_dt"].astype("M8[us, UTC]") tm.assert_frame_equal(result, expected) # Cleanup diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2f981953a6237..d5134a3e3afd0 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -181,9 +181,7 @@ def test_read_dta2(self, datapath): expected["monthly_date"] = expected["monthly_date"].astype("M8[s]") expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]") expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]") - expected["yearly_date"] = ( - expected["yearly_date"].astype("Period[s]").array.view("M8[s]") - ) + expected["yearly_date"] = expected["yearly_date"].astype("M8[s]") path1 = datapath("io", "data", "stata", "stata2_114.dta") path2 = datapath("io", "data", "stata", "stata2_115.dta") @@ -206,9 +204,9 @@ def test_read_dta2(self, datapath): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) @pytest.mark.parametrize( "file", ["stata3_113", "stata3_114", "stata3_115", "stata3_117"] @@ -952,8 +950,8 @@ def test_big_dates(self, datapath, temp_file): parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta")) parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta")) - tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True) - tm.assert_frame_equal(expected, parsed_117, check_datetimelike_compat=True) + tm.assert_frame_equal(expected, parsed_115) + tm.assert_frame_equal(expected, parsed_117) date_conversion = {c: c[-2:] for c in columns} # {c : c[-2:] for c in columns} @@ -965,7 +963,6 @@ def test_big_dates(self, datapath, temp_file): tm.assert_frame_equal( written_and_read_again.set_index("index"), expected.set_index(expected.index.astype(np.int32)), - check_datetimelike_compat=True, ) def test_dtype_conversion(self, datapath): @@ -1252,7 +1249,9 @@ def test_read_chunks_117( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize @@ -1344,7 +1343,9 @@ def 
test_read_chunks_115( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 3428abacd509e..f4ea6b1d3f3de 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -557,7 +557,8 @@ def test_first_last_skipna(any_real_nullable_dtype, skipna, how): method = getattr(rs, how) result = method(skipna=skipna) - gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + ts = pd.to_datetime("2020-01-31").as_unit("ns") + gb = df.groupby(df.shape[0] * [ts]) expected = getattr(gb, how)(skipna=skipna) expected.index.freq = "ME" tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 5f5a54c4d92a3..2646106b9b97c 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -421,11 +421,13 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) ) volume = [50, 50, 60] - week_starting = [ - Timestamp("2018-01-07"), - Timestamp("2018-01-18 01:00:00"), - Timestamp("2018-01-14"), - ] + week_starting = pd.DatetimeIndex( + [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), + ] + ).as_unit("ns") expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], names=["volume", "week_starting"], diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index c831cb8293943..afafe8f6ab264 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -19,12 +19,12 @@ "float64": [1.1, np.nan, 3.3], "category": Categorical(["X", "Y", "Z"]), "object": ["a", "b", "c"], - "datetime64[ns]": [ + "datetime64[s]": [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-03"), ], - "datetime64[ns, US/Eastern]": [ + "datetime64[s, US/Eastern]": [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), pd.Timestamp("2011-01-03", tz="US/Eastern"), diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 3e046b2df72d8..89a3c3c5ed8bc 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -213,7 +213,7 @@ def test_concat_NaT_dataframes(self, tz): @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) - @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")]) + @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101").as_unit("ns")]) def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item): # GH 12396 @@ -358,7 +358,7 @@ def test_concat_tz_series_tzlocal(self): result = concat([Series(x), Series(y)], ignore_index=True) tm.assert_series_equal(result, Series(x + y)) - assert result.dtype == "datetime64[ns, tzlocal()]" + assert result.dtype == "datetime64[s, tzlocal()]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 340c5c449aea7..d8bb4fba1e1fe 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py 
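# --- illustrative aside -------------------------------------------------
# Sketch of the cut() behavior adjusted in the hunks below (assumes pandas
# with this change): bins built from datetime.datetime objects infer
# microsecond resolution, and the resulting IntervalIndex inherits that
# unit.
from datetime import datetime

import pandas as pd

ser = pd.Series([datetime(2013, 1, 1, 12)])
assert ser.dtype == "M8[us]"
res = pd.cut(ser, bins=[datetime(2013, 1, 1), datetime(2013, 1, 2)])
assert res.cat.categories.dtype.subtype == "M8[us]"
# ------------------------------------------------------------------------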
@@ -1,3 +1,5 @@ +from datetime import datetime + import numpy as np import pytest @@ -445,10 +447,16 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CategoricalDtype(ordered=True)) + ) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) + + if type(bins[0]) is datetime: + # The bins have microsecond dtype -> so does result + expected = expected.astype("interval[datetime64[us]]") + + expected = expected.astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -461,10 +469,6 @@ def test_datetime_cut(unit, box): data = box(data) result, _ = cut(data, 3, retbins=True) - if box is list: - # We don't (yet) do inference on these, so get nanos - unit = "ns" - if unit == "s": # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425 # for why we round to 8 seconds instead of 7 @@ -531,24 +535,26 @@ def test_datetime_tz_cut(bins, box): bins = box(bins) result = cut(ser, bins) - expected = Series( - IntervalIndex( - [ - Interval( - Timestamp("2012-12-31 23:57:07.200000", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz), - ), - ] - ) - ).astype(CategoricalDtype(ordered=True)) + ii = IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + if isinstance(bins, int): + # the dtype is inferred from ser, which has nanosecond unit + ii = ii.astype("interval[datetime64[ns, US/Eastern]]") + expected = Series(ii).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 53af673e0f7b0..5f769db7f8acf 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -271,8 +271,10 @@ def test_datetime_tz_qcut(bins): ], ], ) -def test_date_like_qcut_bins(arg, expected_bins): +def test_date_like_qcut_bins(arg, expected_bins, unit): # see gh-19891 + arg = arg.as_unit(unit) + expected_bins = expected_bins.as_unit(unit) ser = Series(arg) result, result_bins = qcut(ser, 2, retbins=True) tm.assert_index_equal(result_bins, expected_bins) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e352e2601cef3..131be7a77f2e5 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -439,8 +439,10 @@ def test_nat_rfloordiv_timedelta(val, expected): @pytest.mark.parametrize( "value", [ - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], dtype="M8[ns]", name="x"), + DatetimeIndex( + ["2011-01-01", "2011-01-02"], dtype="M8[ns, US/Eastern]", name="x" + ), DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], dtype="M8[ns]"), DatetimeArray._from_sequence( ["2011-01-01", "2011-01-02"], dtype=DatetimeTZDtype(tz="US/Pacific") diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 0f2f533c8feff..293919173c2d5 100644 --- 
a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -78,7 +78,7 @@ def test_combine_first_dt64(self, unit): s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") + xp = Series([datetime(2010, 1, 1), "2011"], dtype=f"datetime64[{unit}]") tm.assert_series_equal(rs, xp) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 592dba253532d..c10bb8278a3d1 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -411,7 +411,7 @@ def test_datetime64_tz_fillna(self, tz, unit): Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00", tz=tz), - ] + ], ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index f7dec02ab0e5b..488d0cb9fe9da 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -31,7 +31,9 @@ def test_from_csv(self, datetime_series, string_series, temp_file): path = temp_file datetime_series.to_csv(path, header=False) ts = self.read_csv(path, parse_dates=True) - tm.assert_series_equal(datetime_series, ts, check_names=False) + expected = datetime_series.copy() + expected.index = expected.index.as_unit("s") + tm.assert_series_equal(expected, ts, check_names=False) assert ts.name is None assert ts.index.name is None @@ -57,6 +59,7 @@ def test_from_csv(self, datetime_series, string_series, temp_file): series = self.read_csv(path, sep="|", parse_dates=True) check_series = Series({datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0}) + check_series.index = check_series.index.as_unit("s") tm.assert_series_equal(check_series, series) series = self.read_csv(path, sep="|", parse_dates=False) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3f9d5bbe806bb..00c614cf72c20 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -752,7 +752,7 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([NaT, NaT]) - assert exp.dtype == "datetime64[ns]" + assert exp.dtype == "datetime64[s]" tm.assert_series_equal(Series([NaT, NaT]), exp) tm.assert_series_equal(Series(np.array([NaT, NaT])), exp) @@ -934,7 +934,7 @@ def test_constructor_datetimes_with_nulls(self): np.array([None, None, datetime.now(), None]), ]: result = Series(arr) - assert result.dtype == "M8[ns]" + assert result.dtype == "M8[us]" def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype="M8[ns]", index=range(5)) @@ -962,15 +962,15 @@ def test_constructor_dtype_datetime64_10(self): dates = [np.datetime64(x) for x in pydates] ser = Series(dates) - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" ser.iloc[0] = np.nan - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") @@ -1084,16 +1084,16 @@ def test_constructor_dtype_datetime64_4(self): def test_constructor_dtype_datetime64_3(self): # if we 
passed a NaT it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) - assert ser.dtype == "object" + assert ser.dtype == "M8[us]" assert ser[2] is NaT assert "NaT" in str(ser) def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert ser.dtype == "object" - assert ser[2] is np.nan - assert "NaN" in str(ser) + assert ser.dtype == "M8[us]" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_with_datetime_tz(self): # 8260 @@ -1155,7 +1155,7 @@ def test_constructor_with_datetime_tz4(self): Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ] ) - assert ser.dtype == "datetime64[ns, US/Pacific]" + assert ser.dtype == "datetime64[s, US/Pacific]" assert lib.infer_dtype(ser, skipna=True) == "datetime64" def test_constructor_with_datetime_tz3(self): @@ -1215,7 +1215,7 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, kind, unit): def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype="datetime64[ns, CET]") - expected = Series(Timestamp(arg)).dt.tz_localize("CET") + expected = Series([Timestamp(arg)], dtype="M8[ns]").dt.tz_localize("CET") tm.assert_series_equal(result, expected) def test_constructor_datetime64_bigendian(self): @@ -1356,14 +1356,8 @@ def test_constructor_dict_order(self): expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) - def test_constructor_dict_extension(self, ea_scalar_and_dtype, request): + def test_constructor_dict_extension(self, ea_scalar_and_dtype): ea_scalar, ea_dtype = ea_scalar_and_dtype - if isinstance(ea_scalar, Timestamp): - mark = pytest.mark.xfail( - reason="Construction from dict goes through " - "maybe_convert_objects which casts to nano" - ) - request.applymarker(mark) d = {"a": ea_scalar} result = Series(d, index=["a"]) expected = Series(ea_scalar, index=["a"], dtype=ea_dtype) @@ -1408,7 +1402,9 @@ def create_data(constructor): result_Timestamp = Series(data_Timestamp) tm.assert_series_equal(result_datetime64, expected) - tm.assert_series_equal(result_datetime, expected) + tm.assert_series_equal( + result_datetime, expected.set_axis(expected.index.as_unit("us")) + ) tm.assert_series_equal(result_Timestamp, expected) def test_constructor_dict_tuple_indexer(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6da6ad27f853f..134ebededd163 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1264,6 +1264,7 @@ def test_value_counts_datetime_outofbounds(self, dtype): ], dtype=dtype, ) + res = ser.value_counts() exp_index = Index( diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b05c30fa50fbe..cbbd018720bad 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -117,7 +117,9 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ser = Series([19801222, 19801222] + [19810105] * 5, dtype="float") # with NaT expected = Series( - [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 + [Timestamp("19801222"), Timestamp("19801222")] + + [Timestamp("19810105")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan @@ -143,19 +145,32 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache): # Explicit cast to float to explicit cast when setting np.nan ser = Series([198012, 198012] + [198101] * 5, dtype="float") expected = Series( - 
[Timestamp("19801201"), Timestamp("19801201")] + [Timestamp("19810101")] * 5 + [Timestamp("19801201"), Timestamp("19801201")] + + [Timestamp("19810101")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan result = to_datetime(ser, format="%Y%m", cache=cache) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_oob_for_ns(self, cache): + # coercion + # GH 7930, GH 14487 + ser = Series([20121231, 20141231, 99991231]) + result = to_datetime(ser, format="%Y%m%d", errors="raise", cache=cache) + expected = Series( + np.array(["2012-12-31", "2014-12-31", "9999-12-31"], dtype="M8[s]"), + dtype="M8[s]", + ) + tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # coercion # GH 7930 - ser = Series([20121231, 20141231, 99991231]) + ser = Series([20121231, 20141231, 999999999999999999999999999991231]) result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache) - expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[s]") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -532,7 +547,8 @@ def test_to_datetime_overflow(self): res = to_datetime(arg, errors="coerce") assert res is NaT res = to_datetime([arg], errors="coerce") - tm.assert_index_equal(res, Index([NaT])) + exp = Index([NaT], dtype="M8[s]") + tm.assert_index_equal(res, exp) def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior @@ -563,7 +579,7 @@ def test_to_datetime_mixed_date_and_string(self, format): # https://github.com/pandas-dev/pandas/issues/50108 d1 = date(2020, 1, 2) res = to_datetime(["2020-01-01", d1], format=format) - expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[ns]") + expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[s]") tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -579,7 +595,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, with utc", ), @@ -588,7 +604,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - ), + ).as_unit("us"), id="all tz-aware, without utc", ), pytest.param( @@ -596,7 +612,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, mixed offsets, with utc", ), @@ -605,7 +621,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="tz-aware string, naive pydatetime, with utc", ), @@ -625,6 +641,8 @@ def test_to_datetime_mixed_datetime_and_string_with_format( ts1 = constructor(args[0]) ts2 = args[1] result = to_datetime([ts1, ts2], format=fmt, utc=utc) + if constructor is Timestamp: + expected = expected.as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -696,7 +714,7 @@ def 
test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%m-%d %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="ISO8601, UTC", ), @@ -704,7 +722,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%d-%m %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="non-ISO8601, UTC", ), @@ -965,7 +983,7 @@ def test_to_datetime_now(self): # See GH#18666 with tm.set_timezone("US/Eastern"): # GH#18705 - now = Timestamp("now").as_unit("ns") + now = Timestamp("now") pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] @@ -988,12 +1006,12 @@ def test_to_datetime_today(self, tz): # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. with tm.set_timezone(tz): - nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + nptoday = np.datetime64("today").astype("datetime64[us]").astype(np.int64) pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] - tstoday = Timestamp("today").as_unit("ns") - tstoday2 = Timestamp.today().as_unit("ns") + tstoday = Timestamp("today") + tstoday2 = Timestamp.today() # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -1030,7 +1048,7 @@ def test_to_datetime_now_with_format(self, format, expected_ds, string, attribut # https://github.com/pandas-dev/pandas/issues/50359 result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True) expected = DatetimeIndex( - [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" + [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[s, UTC]" ) assert (expected - result).max().total_seconds() < 1 @@ -1091,11 +1109,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing result = to_datetime(dts, cache=cache) - if cache: - # FIXME: behavior should not depend on cache - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") - else: - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]") + expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") tm.assert_index_equal(result, expected) @@ -1106,14 +1120,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): to_datetime(dts_with_oob, errors="raise") result = to_datetime(dts_with_oob, errors="coerce", cache=cache) - if not cache: - # FIXME: shouldn't depend on cache! 
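# --- illustrative aside -------------------------------------------------
# Sketch of the now cache-independent behavior (assumes pandas with this
# change): to_datetime preserves the unit of datetime64 inputs on both the
# cached and uncached paths, which is why the old FIXME branches here
# collapse into a single expectation.
import numpy as np
import pandas as pd

dt64s = [np.datetime64("2000-01-01 00:00:00"), np.datetime64("2000-01-02 00:00:00")]
for use_cache in (True, False):
    res = pd.to_datetime(dt64s * 30, cache=use_cache)  # 60 values, enough to trigger caching
    assert res.dtype == "M8[s]"
# ------------------------------------------------------------------------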
- expected = DatetimeIndex( - [Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30 - + [NaT], - ) - else: - expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) + expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) tm.assert_index_equal(result, expected) def test_to_datetime_tz(self, cache): @@ -1126,7 +1133,7 @@ def test_to_datetime_tz(self, cache): result = to_datetime(arr, cache=cache) expected = DatetimeIndex( ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_to_datetime_tz_mixed(self, cache): @@ -1145,7 +1152,7 @@ def test_to_datetime_tz_mixed(self, cache): result = to_datetime(arr, cache=cache, errors="coerce") expected = DatetimeIndex( - ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[ns, US/Pacific]" + ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[s, US/Pacific]" ) tm.assert_index_equal(result, expected) @@ -1177,7 +1184,7 @@ def test_to_datetime_tz_pytz(self, cache): result = to_datetime(arr, utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1264,7 +1271,7 @@ def test_to_datetime_tz_psycopg2(self, request, cache): result = to_datetime(arr, errors="coerce", utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1273,15 +1280,15 @@ def test_to_datetime_tz_psycopg2(self, request, cache): i = DatetimeIndex( ["2000-01-01 08:00:00"], tz=psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None), - ) - assert is_datetime64_ns_dtype(i) + ).as_unit("us") + assert not is_datetime64_ns_dtype(i) # tz coercion result = to_datetime(i, errors="coerce", cache=cache) tm.assert_index_equal(result, i) result = to_datetime(i, errors="coerce", utc=True, cache=cache) - expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]") + expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[us, UTC]") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) @@ -1351,16 +1358,20 @@ def test_datetime_invalid_scalar(self, value, format): def test_datetime_outofbounds_scalar(self, value, format): # GH24763 res = to_datetime(value, errors="coerce", format=format) - assert res is NaT + if format is None: + assert isinstance(res, Timestamp) + assert res == Timestamp(value) + else: + assert res is NaT if format is not None: msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' 
with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: - msg = "^Out of bounds .*, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(value, errors="raise", format=format) + res = to_datetime(value, errors="raise", format=format) + assert isinstance(res, Timestamp) + assert res == Timestamp(value) @pytest.mark.parametrize( ("values"), [(["a"]), (["00:01:99"]), (["a", "b", "99:00:00"])] @@ -1433,15 +1444,17 @@ def test_to_datetime_cache_scalar(self): assert result == expected @pytest.mark.parametrize( - "datetimelikes,expected_values", + "datetimelikes,expected_values,exp_unit", ( ( (None, np.nan) + (NaT,) * start_caching_at, (NaT,) * (start_caching_at + 2), + "s", ), ( (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + "s", ), ( (None,) @@ -1449,11 +1462,12 @@ def test_to_datetime_cache_scalar(self): + ("2012 July 26", Timestamp("2012-07-26")), (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + "s", ), ), ) def test_convert_object_to_datetime_with_cache( - self, datetimelikes, expected_values + self, datetimelikes, expected_values, exp_unit ): # GH#39882 ser = Series( @@ -1463,7 +1477,7 @@ def test_convert_object_to_datetime_with_cache( result_series = to_datetime(ser, errors="coerce") expected_series = Series( expected_values, - dtype="datetime64[ns]", + dtype=f"datetime64[{exp_unit}]", ) tm.assert_series_equal(result_series, expected_series) @@ -1484,7 +1498,7 @@ def test_convert_object_to_datetime_with_cache( ) def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 - expected = Series([NaT] * len(input), dtype="M8[ns]") + expected = Series([NaT] * len(input), dtype="M8[s]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1535,7 +1549,17 @@ def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): # https://github.com/pandas-dev/pandas/issues/50255 ts_strings = [string_arg, outofbounds] result = to_datetime(ts_strings, errors="coerce", format=format) - expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) + if isinstance(outofbounds, str) and ( + format.startswith("%B") ^ outofbounds.startswith("J") + ): + # the strings don't match the given format, so they raise and we coerce + expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[s]") + elif isinstance(outofbounds, datetime): + expected = DatetimeIndex( + [datetime(2018, 3, 1), outofbounds], dtype="M8[us]" + ) + else: + expected = DatetimeIndex([datetime(2018, 3, 1), outofbounds], dtype="M8[s]") tm.assert_index_equal(result, expected) def test_to_datetime_malformed_no_raise(self): @@ -1546,7 +1570,9 @@ def test_to_datetime_malformed_no_raise(self): UserWarning, match="Could not infer format", raise_on_extra_warnings=False ): result = to_datetime(ts_strings, errors="coerce") - tm.assert_index_equal(result, Index([NaT, NaT])) + # TODO: should Index get "s" by default here? 
+ exp = Index([NaT, NaT], dtype="M8[s]") + tm.assert_index_equal(result, exp) def test_to_datetime_malformed_raise(self): # GH 48633 @@ -1594,7 +1620,7 @@ def test_iso_8601_strings_with_different_offsets_utc(self): result = to_datetime(ts_strings, utc=True) expected = DatetimeIndex( [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_mixed_offsets_with_native_datetime_utc_false_raises(self): @@ -1620,7 +1646,7 @@ def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) expected = DatetimeIndex( [datetime(2018, 3, 1, 12, tzinfo=timezone(timedelta(minutes=240)))] * 2 - ) + ).as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1641,9 +1667,11 @@ def test_timestamp_utc_true(self, ts, expected): @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"]) def test_to_datetime_with_format_out_of_bounds(self, dt_str): # GH 9107 - msg = "Out of bounds nanosecond timestamp" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(dt_str, format="%Y%m%d") + res = to_datetime(dt_str, format="%Y%m%d") + dtobj = datetime.strptime(dt_str, "%Y%m%d") + expected = Timestamp(dtobj).as_unit("s") + assert res == expected + assert res.unit == expected.unit def test_to_datetime_utc(self): arr = np.array([parse("2012-06-13T01:39:00Z")], dtype=object) @@ -1726,7 +1754,7 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # In 3.0, the string "1.5" is parsed as as it would be without unit, # which fails. With errors="coerce" this becomes NaT. res = to_datetime(["1.5"], unit=unit, errors="coerce") - expected = to_datetime([NaT]) + expected = to_datetime([NaT]).as_unit("ns") tm.assert_index_equal(res, expected) # round floats are OK @@ -2149,7 +2177,7 @@ def test_dataframe_utc_true(self): df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = to_datetime(df, utc=True) expected = Series( - np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[s]") ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -2361,7 +2389,9 @@ def test_to_datetime_with_space_in_series(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) - expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) + expected_coerce = Series( + [datetime(2006, 10, 18), datetime(2008, 10, 18), NaT] + ).dt.as_unit("s") tm.assert_series_equal(result_coerce, expected_coerce) @td.skip_if_not_us_locale @@ -2473,7 +2503,7 @@ def test_string_na_nat_conversion(self, cache): strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) - expected = np.empty(4, dtype="M8[ns]") + expected = np.empty(4, dtype="M8[s]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -2518,7 +2548,7 @@ def test_string_na_nat_conversion_with_name(self, cache): result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) + expected = Series(np.empty(5, dtype="M8[s]"), index=idx) for i in range(5): x = series.iloc[i] if isna(x): @@ -2558,7 +2588,7 @@ def test_dayfirst(self, cache): arr = ["10/02/2014", "11/02/2014", "12/02/2014"] expected = DatetimeIndex( [datetime(2014, 2, 10), datetime(2014, 2, 11), 
datetime(2014, 2, 12)] - ) + ).as_unit("s") idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -2582,7 +2612,7 @@ def test_dayfirst_warnings_valid_input(self): # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None ) # A. dayfirst arg correct, no warning @@ -2687,7 +2717,7 @@ def test_to_datetime_consistent_format(self, cache): ser = Series(np.array(data)) result = to_datetime(ser, cache=cache) expected = Series( - ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[ns]" + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2699,9 +2729,7 @@ def test_to_datetime_series_with_nans(self, cache): ) ) result = to_datetime(ser, cache=cache) - expected = Series( - ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[ns]" - ) + expected = Series(["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[s]") tm.assert_series_equal(result, expected) def test_to_datetime_series_start_with_nans(self, cache): @@ -2720,7 +2748,7 @@ def test_to_datetime_series_start_with_nans(self, cache): result = to_datetime(ser, cache=cache) expected = Series( - [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[ns]" + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2734,6 +2762,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) + expected = expected.dt.as_unit("s") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -2890,9 +2919,16 @@ def test_parsers(self, date_str, expected, cache): # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _ = parsing.parse_datetime_string_with_reso( + result1, reso_attrname = parsing.parse_datetime_string_with_reso( date_str, yearfirst=yearfirst ) + + reso = { + "nanosecond": "ns", + "microsecond": "us", + "millisecond": "ms", + "second": "s", + }.get(reso_attrname, "s") result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below @@ -2907,7 +2943,7 @@ def test_parsers(self, date_str, expected, cache): for res in [result1, result2]: assert res == expected for res in [result3, result4, result6, result8, result9]: - exp = DatetimeIndex([Timestamp(expected)]) + exp = DatetimeIndex([Timestamp(expected)]).as_unit(reso) tm.assert_index_equal(res, exp) # these really need to have yearfirst, but we don't support @@ -2921,7 +2957,7 @@ def test_na_values_with_cache( self, cache, unique_nulls_fixture, unique_nulls_fixture2 ): # GH22305 - expected = Index([NaT, NaT], dtype="datetime64[ns]") + expected = Index([NaT, NaT], dtype="datetime64[s]") result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) tm.assert_index_equal(result, expected) @@ -3197,9 +3233,16 @@ def test_incorrect_value_exception(self): ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 - msg = r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-10 00:00:00", format=format) + if format 
is None: + res = to_datetime("2417-10-10 00:00:00.00", format=format) + assert isinstance(res, Timestamp) + assert res.year == 2417 + assert res.month == 10 + assert res.day == 10 + else: + msg = "unconverted data remains when parsing with format.*, at position 0" + with pytest.raises(ValueError, match=msg): + to_datetime("2417-10-10 00:00:00.00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", @@ -3331,7 +3374,7 @@ def test_empty_string_datetime(errors, args, format): # coerce empty string to pd.NaT result = to_datetime(td, format=format, errors=errors) - expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[ns]") + expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[s]") tm.assert_series_equal(expected, result) @@ -3371,14 +3414,12 @@ def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): ) result1 = to_datetime(ser, errors="coerce", utc=True) - expected1 = Series( - [NaT] + ([Timestamp("1991-10-20 00:00:00+00:00")] * series_length) - ) - + expected1 = Series([Timestamp(x) for x in ser]) + assert expected1.dtype == "M8[us, UTC]" tm.assert_series_equal(result1, expected1) - with pytest.raises(OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp"): - to_datetime(ser, errors="raise", utc=True) + result3 = to_datetime(ser, errors="raise", utc=True) + tm.assert_series_equal(result3, expected1) def test_to_datetime_format_f_parse_nanos(): @@ -3463,7 +3504,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 vals = ["2020-01-01 00:00+00:00", ""] result = to_datetime(vals, format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[ns, UTC]") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[s, UTC]") tm.assert_index_equal(result, expected) # Check that a couple of other similar paths work the same way diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index ba000a0439dd1..894f49b2fa140 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -29,7 +29,7 @@ def test_to_timedelta_dt64_raises(self): # supported GH#29794 msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" - ser = Series([pd.NaT]) + ser = Series([pd.NaT], dtype="M8[ns]") with pytest.raises(TypeError, match=msg): to_timedelta(ser) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index 99829857e6836..90e2e117852a2 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -57,10 +57,10 @@ def __init__(self, name=None, rules=None) -> None: jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)]) # Getting holidays for Jan 1 should not alter results for Jan 2. 
- expected = DatetimeIndex(["01-Jan-2015"]).as_unit("ns") + expected = DatetimeIndex(["01-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan1.holidays(), expected) - expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("ns") + expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan2.holidays(), expected2) diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 35b72c9bb2887..3c55ae2c6f904 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -15,7 +15,6 @@ tslib, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas import Timestamp import pandas._testing as tm @@ -156,7 +155,7 @@ def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(expected, dtype="M8[ns]") + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -174,6 +173,8 @@ def test_parsing_timezone_offsets(dt_string, expected_tz): # to the same datetime after the timezone offset is added. arr = np.array(["01-01-2013 00:00:00"], dtype=object) expected, _ = tslib.array_to_datetime(arr) + if "000000000" in dt_string: + expected = expected.astype("M8[ns]") arr = np.array([dt_string], dtype=object) result, result_tz = tslib.array_to_datetime(arr) @@ -206,38 +207,35 @@ def test_parsing_different_timezone_offsets(): @pytest.mark.parametrize( - "invalid_date", + "invalid_date,exp_unit", [ - date(1000, 1, 1), - datetime(1000, 1, 1), - "1000-01-01", - "Jan 1, 1000", - np.datetime64("1000-01-01"), + (date(1000, 1, 1), "s"), + (datetime(1000, 1, 1), "us"), + ("1000-01-01", "s"), + ("Jan 1, 1000", "s"), + (np.datetime64("1000-01-01"), "s"), ], ) @pytest.mark.parametrize("errors", ["coerce", "raise"]) -def test_coerce_outside_ns_bounds(invalid_date, errors): +def test_coerce_outside_ns_bounds(invalid_date, exp_unit, errors): arr = np.array([invalid_date], dtype="object") - kwargs = {"values": arr, "errors": errors} - if errors == "raise": - msg = "^Out of bounds nanosecond timestamp: .*, at position 0$" + result, _ = tslib.array_to_datetime(arr, errors=errors) + out_reso = np.datetime_data(result.dtype)[0] + assert out_reso == exp_unit + ts = Timestamp(invalid_date) + assert ts.unit == exp_unit - with pytest.raises(OutOfBoundsDatetime, match=msg): - tslib.array_to_datetime(**kwargs) - else: # coerce. 
- result, _ = tslib.array_to_datetime(**kwargs) - expected = np.array([iNaT], dtype="M8[ns]") - - tm.assert_numpy_array_equal(result, expected) + expected = np.array([ts._value], dtype=f"M8[{exp_unit}]") + tm.assert_numpy_array_equal(result, expected) def test_coerce_outside_ns_bounds_one_valid(): arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = [iNaT, "2000-01-01T00:00:00.000000000"] - expected = np.array(expected, dtype="M8[ns]") + expected = ["1000-01-01T00:00:00.000000000", "2000-01-01T00:00:00.000000000"] + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -247,7 +245,13 @@ def test_coerce_of_invalid_datetimes(): # With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) + + # With coercing, the invalid dates becomes iNaT + result, _ = tslib.array_to_datetime(arr, errors="coerce") + expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] + + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) def test_to_datetime_barely_out_of_bounds(): @@ -292,5 +296,5 @@ def test_datetime_subclass(klass): arr = np.array([klass(2000, 1, 1)], dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(["2000-01-01T00:00:00.000000000"], dtype="M8[ns]") + expected = np.array(["2000-01-01T00:00:00.000000"], dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index a54e0071aa006..e654534ccd453 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -260,14 +260,14 @@ def test_categorical_consistency(s1, categorize): tm.assert_series_equal(h1, h3) -def test_categorical_with_nan_consistency(): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") - ) - expected = hash_array(c, categorize=False) - - c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) - result = hash_array(c, categorize=False) +def test_categorical_with_nan_consistency(unit): + dti = pd.date_range("2012-01-01", periods=5, name="B", unit=unit) + cat = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], categories=dti) + expected = hash_array(cat, categorize=False) + + ts = pd.Timestamp("2012-01-01").as_unit(unit) + cat2 = pd.Categorical.from_codes([-1, 0], categories=[ts]) + result = hash_array(cat2, categorize=False) assert result[0] in expected assert result[1] in expected From bce10b46fe72e7d517eb2e874e8d4f60f5fced69 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Fri, 31 May 2024 18:41:47 +0000 Subject: [PATCH 017/272] PERF: more realistic np datetime c benchmark (#58165) * make input range for np_datetime.c benchmark more realistic * fix random numbers * fix import * add new benchmark --- asv_bench/benchmarks/tslibs/fields.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 3a2baec54109a..fe31879e67a67 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -19,10 +19,15 @@ class TimeGetTimedeltaField: def setup(self, size, field): arr = np.random.randint(0, 10, size=size, dtype="i8") 
self.i8data = arr + arr = np.random.randint(-86400 * 1_000_000_000, 0, size=size, dtype="i8") + self.i8data_negative = arr def time_get_timedelta_field(self, size, field): get_timedelta_field(self.i8data, field) + def time_get_timedelta_field_negative_td(self, size, field): + get_timedelta_field(self.i8data_negative, field) + class TimeGetDateField: params = [ @@ -72,3 +77,6 @@ def setup(self, size, side, period, freqstr, month_kw): def time_get_start_end_field(self, size, side, period, freqstr, month_kw): get_start_end_field(self.i8data, self.attrname, freqstr, month_kw=month_kw) + + +from ..pandas_vb_common import setup # noqa: F401 isort:skip From 6c55f5e7f145ecb5ee1e44f91f8167e59b285050 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 11:55:32 -0700 Subject: [PATCH 018/272] Bump pypa/cibuildwheel from 2.18.0 to 2.18.1 (#58840) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.18.0 to 2.18.1. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.18.0...v2.18.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ab0201ca623aa..d7a98671c42bc 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -140,7 +140,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.18.0 + uses: pypa/cibuildwheel@v2.18.1 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 9ada6084409b8db42b7542142b7ae817c2a87246 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:55:20 +0530 Subject: [PATCH 019/272] DOC: fix PR07,SA01 for pandas.util.hash_array (#58877) --- ci/code_checks.sh | 1 - pandas/core/util/hashing.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b9b3ca24b4162..b4ae6976c6916 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -788,7 +788,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \ -i "pandas.unique PR07" \ - -i "pandas.util.hash_array PR07,SA01" \ -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 3b9dd40a92ce8..e120e69dc27cf 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -244,6 +244,7 @@ def hash_array( Parameters ---------- vals : ndarray or ExtensionArray + The input array to hash. encoding : str, default 'utf8' Encoding for data & key when strings. hash_key : str, default _default_hash_key @@ -257,6 +258,11 @@ def hash_array( ndarray[np.uint64, ndim=1] Hashed values, same length as the vals. 
+    See Also
+    --------
+    util.hash_pandas_object : Return a data hash of the Index/Series/DataFrame.
+    util.hash_tuples : Hash a MultiIndex / listlike-of-tuples efficiently.
+
     Examples
     --------
     >>> pd.util.hash_array(np.array([1, 2, 3]))

From a608dfa4942dd9c16bdaef2bf4bd21b8fa18ddd2 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Sat, 1 Jun 2024 21:55:59 +0530
Subject: [PATCH 020/272] DOC: fix PR02 for pandas.tseries.offsets.YearEnd (#58878)

---
 ci/code_checks.sh               | 1 -
 pandas/_libs/tslibs/offsets.pyx | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index b4ae6976c6916..700b0bda1b916 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -779,7 +779,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.offsets.YearBegin.nanos GL08" \
         -i "pandas.tseries.offsets.YearBegin.normalize GL08" \
         -i "pandas.tseries.offsets.YearBegin.rule_code GL08" \
-        -i "pandas.tseries.offsets.YearEnd PR02" \
         -i "pandas.tseries.offsets.YearEnd.freqstr SA01" \
         -i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \
         -i "pandas.tseries.offsets.YearEnd.month GL08" \
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index 0d681e0c2aae6..f9d63065493c3 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -2579,7 +2579,7 @@ cdef class YearEnd(YearOffset):

     YearEnd goes to the next date which is the end of the year.

-    Parameters
+    Attributes
     ----------
     n : int, default 1
         The number of years represented.

From 71d067f60c1bf7d9fdddf307ac6c525c11359633 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Sat, 1 Jun 2024 21:56:33 +0530
Subject: [PATCH 021/272] DOC: fix SA01 for pandas.timedelta_range (#58879)

---
 ci/code_checks.sh                 | 1 -
 pandas/core/indexes/timedeltas.py | 5 +++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 700b0bda1b916..80248b4fd5944 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -494,7 +494,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.testing.assert_extension_array_equal SA01" \
         -i "pandas.testing.assert_index_equal PR07,SA01" \
         -i "pandas.testing.assert_series_equal PR07,SA01" \
-        -i "pandas.timedelta_range SA01" \
         -i "pandas.tseries.offsets.BDay PR02,SA01" \
         -i "pandas.tseries.offsets.BQuarterBegin PR02" \
         -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index 8af5a56f43c57..29039ffd0217e 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -273,6 +273,11 @@ def timedelta_range(
     TimedeltaIndex
         Fixed frequency, with day as the default.

+    See Also
+    --------
+    date_range : Return a fixed frequency DatetimeIndex.
+    period_range : Return a fixed frequency PeriodIndex.
+
     Notes
     -----
     Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,

From 04fccd250f4ab84e2effda647154612076620cc3 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Sat, 1 Jun 2024 21:57:18 +0530
Subject: [PATCH 022/272] DOC: fix PR07,SA01 for pandas.testing.assert_index_equal (#58880)

---
 ci/code_checks.sh            | 1 -
 pandas/_testing/asserters.py | 7 +++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 80248b4fd5944..6feff654a36f5 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -492,7 +492,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.show_versions SA01" \
         -i "pandas.test SA01" \
         -i "pandas.testing.assert_extension_array_equal SA01" \
-        -i "pandas.testing.assert_index_equal PR07,SA01" \
         -i "pandas.testing.assert_series_equal PR07,SA01" \
         -i "pandas.tseries.offsets.BDay PR02,SA01" \
         -i "pandas.tseries.offsets.BQuarterBegin PR02" \
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index 543d7944e4c5d..ecaa7e5507996 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -196,7 +196,9 @@ def assert_index_equal(
     Parameters
     ----------
     left : Index
+        The first index to compare.
     right : Index
+        The second index to compare.
     exact : bool or {'equiv'}, default 'equiv'
         Whether to check the Index class, dtype and inferred_type are
         identical. If 'equiv', then RangeIndex can be substituted for
@@ -219,6 +221,11 @@ def assert_index_equal(
         Specify object name being compared, internally used to show
         appropriate assertion message.

+    See Also
+    --------
+    testing.assert_series_equal : Check that two Series are equal.
+    testing.assert_frame_equal : Check that two DataFrames are equal.
+
     Examples
     --------
     >>> from pandas import testing as tm

From bef7c39f4214557bf02705ee6b41b88f4cdeed93 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Sat, 1 Jun 2024 21:58:28 +0530
Subject: [PATCH 023/272] DOC: fix SA01 for pandas.test (#58881)

---
 ci/code_checks.sh      | 1 -
 pandas/util/_tester.py | 4 ++++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 6feff654a36f5..d79a42a6fef6b 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -490,7 +490,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.set_eng_float_format RT03,SA01" \
         -i "pandas.set_option SA01" \
         -i "pandas.show_versions SA01" \
-        -i "pandas.test SA01" \
         -i "pandas.testing.assert_extension_array_equal SA01" \
         -i "pandas.testing.assert_series_equal PR07,SA01" \
         -i "pandas.tseries.offsets.BDay PR02,SA01" \
diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py
index 494f306ec807d..c0e9756372f47 100644
--- a/pandas/util/_tester.py
+++ b/pandas/util/_tester.py
@@ -27,6 +27,10 @@ def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> Non
     both doctests/regular tests, just append "--doctest-modules"/"--doctest-cython"
     to extra_args.

+    See Also
+    --------
+    pytest.main : The main entry point for the pytest testing framework.
+ Examples -------- >>> pd.test() # doctest: +SKIP From 4a9ac5a94c1a460b27be7dea45e188ad1dc42bd5 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:58:28 +0530 Subject: [PATCH 024/272] DOC: fix SA01 for pandas.set_option (#58882) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d79a42a6fef6b..5f53705133816 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -488,7 +488,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.read_spss SA01" \ -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.set_option SA01" \ -i "pandas.show_versions SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.testing.assert_series_equal PR07,SA01" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 1b91a7c3ee636..5bd6535b0343c 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -211,6 +211,14 @@ def set_option(*args) -> None: TypeError if keyword arguments are provided OptionError if no such option exists + See Also + -------- + get_option : Retrieve the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + option_context : Context manager to temporarily set options in a ``with`` + statement. + Notes ----- For all available options, please view the :ref:`User Guide ` From 979d51e0e8d914bbb739b1d9d557f6b3dabf8c27 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:59:01 +0530 Subject: [PATCH 025/272] DOC: fix SA01 for pandas.read_sas (#58883) * DOC: fix SA01 for pandas.read_sas * DOC: fix SA01 for pandas.read_sas --- ci/code_checks.sh | 1 - pandas/io/sas/sasreader.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5f53705133816..ef079cac0414b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -484,7 +484,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.qcut PR07,SA01" \ -i "pandas.read_feather SA01" \ -i "pandas.read_orc SA01" \ - -i "pandas.read_sas SA01" \ -i "pandas.read_spss SA01" \ -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 12d698a4f76a8..6daf4a24781bd 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -124,6 +124,14 @@ def read_sas( DataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader, file format is inferred from file extension. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
+ Examples -------- >>> df = pd.read_sas("sas_data.sas7bdat") # doctest: +SKIP From 84dab8937404e9f671e969338bf97e86b3948830 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 21:59:36 +0530 Subject: [PATCH 026/272] DOC: fix SA01 for pandas.read_feather (#58884) --- ci/code_checks.sh | 1 - pandas/io/feather_format.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ef079cac0414b..d5b4dd2f59bea 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -482,7 +482,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.plotting.table PR07,RT03,SA01" \ -i "pandas.qcut PR07,SA01" \ - -i "pandas.read_feather SA01" \ -i "pandas.read_orc SA01" \ -i "pandas.read_spss SA01" \ -i "pandas.reset_option SA01" \ diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index b42dbaa579ee7..8132167fbe05c 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -107,6 +107,14 @@ def read_feather( type of object stored in file DataFrame object stored in the file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_sas : Read SAS file into a pandas DataFrame. + Examples -------- >>> df = pd.read_feather("path/to/file.feather") # doctest: +SKIP From b0c4194c2fd63a84adc6e84eb77e2102e96d4b71 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 1 Jun 2024 22:00:08 +0530 Subject: [PATCH 027/272] DOC: fix SA01 for pandas.plotting.plot_params (#58885) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d5b4dd2f59bea..230b4d3daf243 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -478,7 +478,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.autocorrelation_plot RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ - -i "pandas.plotting.plot_params SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.plotting.table PR07,RT03,SA01" \ -i "pandas.qcut PR07,SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index af7ddf39283c0..d79bb7152e6b4 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -617,6 +617,14 @@ class _Options(dict): the same as the plot function parameters, but is stored in a canonical format that makes it easy to breakdown into groups later. + See Also + -------- + plotting.register_matplotlib_converters : Register pandas formatters and + converters with matplotlib. + plotting.bootstrap_plot : Bootstrap plot on mean, median and mid-range statistics. + plotting.autocorrelation_plot : Autocorrelation plot for time series. + plotting.lag_plot : Lag plot for time series. 
+ Examples -------- From 1b48cef8749296b043ff7100853361c2429bb08f Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:06:45 +0530 Subject: [PATCH 028/272] DOC: fix PR07 for pandas.unique (#58887) --- ci/code_checks.sh | 1 - pandas/core/algorithms.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 230b4d3daf243..e1181abda0010 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -779,7 +779,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.YearEnd.nanos GL08" \ -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \ - -i "pandas.unique PR07" \ -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b6d157eda8fd3..0d97f8a298fdb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -319,6 +319,8 @@ def unique(values): Parameters ---------- values : 1d array-like + The input array-like object containing values from which to extract + unique values. Returns ------- From 8643b2fb3faf8cbb4ebf47a35cc298744d0c991b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:07:35 +0530 Subject: [PATCH 029/272] DOC: fix PR07,SA01 for pandas.testing.assert_series_equal (#58888) --- ci/code_checks.sh | 1 - pandas/_testing/asserters.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e1181abda0010..153433160f482 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -487,7 +487,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.show_versions SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ - -i "pandas.testing.assert_series_equal PR07,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin PR02" \ -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \ diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index ecaa7e5507996..430840711122a 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -857,7 +857,9 @@ def assert_series_equal( Parameters ---------- left : Series + First Series to compare. right : Series + Second Series to compare. check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' @@ -908,6 +910,11 @@ def assert_series_equal( .. versionadded:: 1.5.0 + See Also + -------- + testing.assert_index_equal : Check that two Indexes are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. 
+ Examples -------- >>> from pandas import testing as tm From 39cc859debb80c223ac68d311604c15f706ad255 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:08:10 +0530 Subject: [PATCH 030/272] DOC: fix SA01 for pandas.show_versions (#58889) --- ci/code_checks.sh | 1 - pandas/util/_print_versions.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 153433160f482..f3e84a6e01331 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -485,7 +485,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.read_spss SA01" \ -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.show_versions SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin PR02" \ diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 6cdd96996cea6..c4fec39594407 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -115,6 +115,11 @@ def show_versions(as_json: str | bool = False) -> None: Info will be written to that file in JSON format. * If True, outputs info in JSON format to the console. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + Examples -------- >>> pd.show_versions() # doctest: +SKIP From 23e4c3a925c2323cf36b8c86bc4fa84b0cbdcb85 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:08:37 +0530 Subject: [PATCH 031/272] DOC: fix SA01 for pandas.reset_option (#58890) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f3e84a6e01331..22106c35d3fb3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -483,7 +483,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.qcut PR07,SA01" \ -i "pandas.read_orc SA01" \ -i "pandas.read_spss SA01" \ - -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 5bd6535b0343c..2fb258cc3e874 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -323,6 +323,12 @@ def reset_option(pat: str) -> None: None No return value. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + describe_option : Print the description for one or more registered options. 
+ Notes ----- For all available options, please view the From bef7c39f4214557bf02705ee6b41b88f4cdeed93 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:09:17 +0530 Subject: [PATCH 032/272] DOC: fix PR07,RT03,SA01 for pandas.plotting.table (#58892) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 22106c35d3fb3..c4666bbd3a5fa 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -479,7 +479,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ - -i "pandas.plotting.table PR07,RT03,SA01" \ -i "pandas.qcut PR07,SA01" \ -i "pandas.read_orc SA01" \ -i "pandas.read_spss SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index d79bb7152e6b4..8d36a8767bc66 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -33,6 +33,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: Parameters ---------- ax : Matplotlib axes object + The axes on which to draw the table. data : DataFrame or Series Data for table contents. **kwargs @@ -43,6 +44,12 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: Returns ------- matplotlib table object + The created table as a matplotlib Table object. + + See Also + -------- + DataFrame.plot : Make plots of DataFrame using matplotlib. + matplotlib.pyplot.table : Create a table from data in a Matplotlib plot. Examples -------- From 2e93c6336857c12c188cbb7386100593eb885efd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:09:44 +0530 Subject: [PATCH 033/272] DOC: fix SA01 for pandas.read_orc (#58891) --- ci/code_checks.sh | 1 - pandas/io/orc.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c4666bbd3a5fa..d11287c033675 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -480,7 +480,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.qcut PR07,SA01" \ - -i "pandas.read_orc SA01" \ -i "pandas.read_spss SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ diff --git a/pandas/io/orc.py b/pandas/io/orc.py index d4b4fd90658ad..3bca8ea7ef1df 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -85,6 +85,14 @@ def read_orc( DataFrame DataFrame based on the ORC file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_sas : Load a SAS file into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
+ Notes ----- Before using this function you should read the :ref:`user guide about ORC ` From 8c2391192e0af276a37a0c4a869eb24de0c134ff Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:10:15 +0530 Subject: [PATCH 034/272] DOC: fix PR07,RT03,SA01 for pandas.plotting.parallel_coordinates (#58894) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d11287c033675..ea0fc1bb38427 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -477,7 +477,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.autocorrelation_plot RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ - -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.qcut PR07,SA01" \ -i "pandas.read_spss SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 8d36a8767bc66..7b498c7c94178 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -479,6 +479,7 @@ def parallel_coordinates( Parameters ---------- frame : DataFrame + The DataFrame to be plotted. class_column : str Column name containing class names. cols : list, optional @@ -505,6 +506,13 @@ def parallel_coordinates( Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the parallel coordinates plot. + + See Also + -------- + plotting.andrews_curves : Generate a matplotlib plot for visualizing clusters + of multivariate data. + plotting.radviz : Plot a multidimensional dataset in 2D. Examples -------- From e00e142235d943d5c71aa6fad2e0f91b27354035 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:10:48 +0530 Subject: [PATCH 035/272] DOC: fix RT03,SA01 for pandas.plotting.autocorrelation_plot (#58895) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ea0fc1bb38427..5300a6c03b87f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -475,7 +475,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.pivot PR07" \ -i "pandas.pivot_table PR07" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.autocorrelation_plot RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.qcut PR07,SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 7b498c7c94178..d8455f44ef0d1 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -606,6 +606,12 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the autocorrelation plot. + + See Also + -------- + Series.autocorr : Compute the lag-N autocorrelation for a Series. + plotting.lag_plot : Lag plot for time series. 
Examples -------- From 93d9561c038daba40fad9c15b5639bc17ee4a1e6 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Jun 2024 23:11:49 +0530 Subject: [PATCH 036/272] DOC: fix SA01 for pandas.option_context (#58897) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5300a6c03b87f..1e615b6df8446 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -470,7 +470,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.merge PR07" \ -i "pandas.merge_asof PR07,RT03" \ -i "pandas.merge_ordered PR07" \ - -i "pandas.option_context SA01" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.pivot PR07" \ -i "pandas.pivot_table PR07" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 2fb258cc3e874..55d9e29686259 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -420,6 +420,13 @@ def option_context(*args) -> Generator[None, None, None]: None No return value. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + Notes ----- For all available options, please view the :ref:`User Guide ` From f49b286615a3a67d8bf41b007b18ce6e10f8d0de Mon Sep 17 00:00:00 2001 From: Cristina Yenyxe Gonzalez Garcia Date: Mon, 3 Jun 2024 20:06:41 +0200 Subject: [PATCH 037/272] Removed duplicated description of cumsum and cumprod methods (#58902) --- doc/source/user_guide/missing_data.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 29f3fea899336..69dfb406daa43 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -337,10 +337,8 @@ When taking the product, NA values or empty data will be treated as 1. pd.Series([], dtype="float64").prod() Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` -ignore NA values by default preserve them in the result. This behavior can be changed -with ``skipna`` - -* Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use ``skipna=False``. +ignore NA values by default, but preserve them in the resulting array. To override +this behaviour and include NA values in the calculation, use ``skipna=False``. .. 
ipython:: python From ed4b8676f46bb259f8be2a6ba6466602002e0e4a Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:18:13 -0700 Subject: [PATCH 038/272] BUG: Check `min_periods` before applying the function (#58886) * Check min_periods before calling the function * Update whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/window/numba_.py | 4 ++-- pandas/tests/window/test_numba.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 865996bdf8892..872c5b64892e1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -536,7 +536,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - +- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index eb06479fc325e..824cf936b8185 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -227,10 +227,10 @@ def roll_table( stop = end[i] window = values[start:stop] count_nan = np.sum(np.isnan(window), axis=0) - sub_result = numba_func(window, *args) nan_mask = len(window) - count_nan >= minimum_periods + if nan_mask.any(): + result[i, :] = numba_func(window, *args) min_periods_mask[i, :] = nan_mask - result[i, :] = sub_result result = np.where(min_periods_mask, result, np.nan) return result diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 650eb911e410b..3695ab8bf6cd3 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -67,6 +67,21 @@ def f(x, *args): ) tm.assert_series_equal(result, expected) + def test_numba_min_periods(self): + # GH 58868 + def last_row(x): + assert len(x) == 3 + return x[-1] + + df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]]) + + result = df.rolling(3, method="table", min_periods=3).apply( + last_row, raw=True, engine="numba" + ) + + expected = DataFrame([[np.nan, np.nan], [np.nan, np.nan], [5, 6], [7, 8]]) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "data", [ From a5492ee9ab1459185b0792f3bea6b7acd36d3112 Mon Sep 17 00:00:00 2001 From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com> Date: Mon, 3 Jun 2024 19:33:23 +0100 Subject: [PATCH 039/272] ENH: Restore support for reading Stata 104 format dta files, and add support for 103 (#58555) --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/stata.py | 7 ++--- .../tests/io/data/stata/stata-compat-103.dta | Bin 0 -> 650 bytes .../tests/io/data/stata/stata-compat-104.dta | Bin 0 -> 647 bytes .../io/data/stata/stata-compat-be-103.dta | Bin 0 -> 650 bytes .../io/data/stata/stata-compat-be-104.dta | Bin 0 -> 647 bytes pandas/tests/io/data/stata/stata4_103.dta | Bin 0 -> 780 bytes pandas/tests/io/data/stata/stata4_104.dta | 
From a5492ee9ab1459185b0792f3bea6b7acd36d3112 Mon Sep 17 00:00:00 2001
From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com>
Date: Mon, 3 Jun 2024 19:33:23 +0100
Subject: [PATCH 039/272] ENH: Restore support for reading Stata 104 format dta files, and add support for 103 (#58555)

---
 doc/source/whatsnew/v3.0.0.rst                |   2 +-
 pandas/io/stata.py                            |   7 ++---
 .../tests/io/data/stata/stata-compat-103.dta  | Bin 0 -> 650 bytes
 .../tests/io/data/stata/stata-compat-104.dta  | Bin 0 -> 647 bytes
 .../io/data/stata/stata-compat-be-103.dta     | Bin 0 -> 650 bytes
 .../io/data/stata/stata-compat-be-104.dta     | Bin 0 -> 647 bytes
 pandas/tests/io/data/stata/stata4_103.dta     | Bin 0 -> 780 bytes
 pandas/tests/io/data/stata/stata4_104.dta     | Bin 0 -> 770 bytes
 pandas/tests/io/test_stata.py                 |  25 +++++++++++++++++-
 9 files changed, 29 insertions(+), 5 deletions(-)
 create mode 100644 pandas/tests/io/data/stata/stata-compat-103.dta
 create mode 100644 pandas/tests/io/data/stata/stata-compat-104.dta
 create mode 100644 pandas/tests/io/data/stata/stata-compat-be-103.dta
 create mode 100644 pandas/tests/io/data/stata/stata-compat-be-104.dta
 create mode 100644 pandas/tests/io/data/stata/stata4_103.dta
 create mode 100644 pandas/tests/io/data/stata/stata4_104.dta

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 872c5b64892e1..e5d65ad82cc95 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -44,8 +44,8 @@ Other enhancements
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
+- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
 - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.notable_bug_fixes:
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index b87ec94b85bb0..4e7bd160a5a52 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -91,7 +91,7 @@
 
 _version_error = (
     "Version of given Stata file is {version}. pandas supports importing "
-    "versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
+    "versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
    "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
    "and 119 (Stata 15/16, over 32,767 variables)."
 )
@@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int:
 
     def _read_old_header(self, first_char: bytes) -> None:
         self._format_version = int(first_char[0])
-        if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]:
+        if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]:
             raise ValueError(_version_error.format(version=self._format_version))
         self._set_encoding()
         self._byteorder = ">" if self._read_int8() == 0x1 else "<"
@@ -1405,7 +1405,8 @@ def _read_old_header(self, first_char: bytes) -> None:
 
         self._data_label = self._get_data_label()
 
-        self._time_stamp = self._get_time_stamp()
+        if self._format_version >= 105:
+            self._time_stamp = self._get_time_stamp()
 
         # descriptors
         if self._format_version >= 111:
diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..adfeb6c672333b3223c6b4afebda5598baea7b5b
GIT binary patch
literal 650
zcmYdiVr1Z8U}hi;axyb>(o#}7GxJhXD?rLKEufk*4b32|Ok*PmBMmCUkOF6vKv~6x
z1~4&nTGh}<&mf&a)dEDqDX5?&M9|OxtOQOKqZ=~HCp!cffja;H|Nr~{|N8%D&z#vi
zYbFTnvKo$T1

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..0e2ef231f91c00f79e5e8c274e139f325fe4e6f2
GIT binary patch
literal 650
zcmYdiWMp9AU|?Wi24cJ*Co?lAEhVKhGcP5z0;Dq20;)dK&;M1Xd*;kp
z`<*jEV3xy~POy=QNy#axY3Ui6SwJom)RuidK!qpa3L8MeC<>VwSr{1_7#Q~fO|rji
N3@8AlsTzq>L>mP?4i>lpG~Df&Et|gNqCu
p3%2Y|0`Q6`aK%bjQ)O1ZcUyy*4k`C)H*9N?q$0}YHR8`maNZ$c)vs!_siRYXz>uu
-3qpwgygf`SkV@TwRhq`yIV>Vw7(}jM_c`x?6b;b8>^S?%+&c%>gsCOlizB7$u_!9
4knbo2~n$#3R1^d8Z0Sye6h&jL&2peMRxs{myW`N-)OAc*p9QggkuL6+w;N#b5Y^p^
bvMYA&o;NhEe3J##9;1jDowcjJ)t?;>epnSRVK@=UbZt+=R>Kht5r{atEtI0x=s!z
l)ed(tBwqk=ds3ZVqGVVN4_NIcHX!k6rF+|kfa1)Ij;M_JS6}N)D1z(z@pRuI5U9d
sD{9a0bd`$ry8{&?n#Zggh1T0nlJ%@xJajDDF*JHr?lAPnzzetLhj`!)4}>Qet;oH
7-NDdy#ntv)dJz*qb@7Wk-|3>2MV*7g4(=EzSeu~$kJw02GY(%f*?`8&t-+}yz4W5
NdS>#HWZVC8#&0|9qDTM$

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index d5134a3e3afd0..a27448a342a19 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath):
         # stata doesn't save .category metadata
         tm.assert_frame_equal(parsed, expected)
 
-    @pytest.mark.parametrize("version", [105, 108])
+    @pytest.mark.parametrize("version", [103, 104, 105, 108])
     def test_readold_dta4(self, version, datapath):
         # This test is the same as test_read_dta4 above except that the columns
         # had to be renamed to match the restrictions in older file format
@@ -2011,6 +2011,18 @@ def test_backward_compat(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)
 
 
+@pytest.mark.parametrize("version", [103, 104])
+def test_backward_compat_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    data_base = datapath("io", "data", "stata")
+    ref = os.path.join(data_base, "stata-compat-118.dta")
+    old = os.path.join(data_base, f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
 @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
 def test_bigendian(version, datapath):
     ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
@@ -2020,6 +2032,17 @@ def test_bigendian(version, datapath):
     tm.assert_frame_equal(big_dta, expected)
 
 
+@pytest.mark.parametrize("version", [103, 104])
+def test_bigendian_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    big_dta = read_stata(big, convert_dates=False)
+    tm.assert_frame_equal(big_dta, expected)
+
+
 def test_direct_read(datapath, monkeypatch):
     file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
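[Editor's note — a hedged usage sketch, not part of the patch. With PATCH 039 applied, 103- and 104-format files open through the same entry point as newer files. The filename below is a placeholder, and ``convert_dates=False`` mirrors the tests above, since these formats predate Stata's date types and timestamp header.]

import pandas as pd

# "old_survey.dta" stands in for any Stata 103/104-era file.
df = pd.read_stata("old_survey.dta", convert_dates=False)
print(df.dtypes)
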
"data", "stata", f"stata-compat-{version}.dta") @@ -2020,6 +2032,17 @@ def test_bigendian(version, datapath): tm.assert_frame_equal(big_dta, expected) +@pytest.mark.parametrize("version", [103, 104]) +def test_bigendian_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta") + expected = read_stata(ref, convert_dates=False) + big_dta = read_stata(big, convert_dates=False) + tm.assert_frame_equal(big_dta, expected) + + def test_direct_read(datapath, monkeypatch): file_path = datapath("io", "data", "stata", "stata-compat-118.dta") From 9f71476fc8f7ace8ee7c24dca3c5ac8e279d7c61 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Jun 2024 08:43:03 -1000 Subject: [PATCH 040/272] CLN: Stopped object inference in constructors for pandas objects (#58758) * CLN: Stopped object inference in constructors for pandas objects * Adjust tests --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_testing/__init__.py | 13 +++-------- pandas/core/construction.py | 4 ++-- pandas/core/frame.py | 16 -------------- pandas/core/indexes/base.py | 22 +++++-------------- pandas/core/internals/construction.py | 8 +++---- pandas/core/series.py | 16 -------------- pandas/tests/copy_view/test_constructors.py | 10 ++++----- pandas/tests/frame/test_constructors.py | 17 +++++--------- .../indexes/base_class/test_constructors.py | 16 +++++--------- pandas/tests/indexes/test_base.py | 17 ++++++-------- .../series/accessors/test_dt_accessor.py | 5 ++--- pandas/tests/series/methods/test_equals.py | 6 ++--- pandas/tests/series/test_constructors.py | 21 ++++++------------ .../tseries/frequencies/test_inference.py | 12 ---------- 15 files changed, 48 insertions(+), 136 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e5d65ad82cc95..2707adb06a1d6 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -311,6 +311,7 @@ Removal of prior version deprecations/changes - Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`) - Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`) - Stopped automatically casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) +- Stopped performing dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when given a pandas object (:class:`Series`, :class:`Index`, :class:`ExtensionArray`), call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`) - Stopped performing dtype inference when setting a :class:`Index` into a :class:`DataFrame` (:issue:`56102`) - Stopped performing dtype inference with in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`) - Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 12395b42bba19..a757ef6fc1a29 100644 --- 
a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -10,7 +10,6 @@ ContextManager, cast, ) -import warnings import numpy as np @@ -290,17 +289,11 @@ def box_expected(expected, box_cls, transpose: bool = True): else: expected = pd.array(expected, copy=False) elif box_cls is Index: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Index(expected) + expected = Index(expected) elif box_cls is Series: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected) + expected = Series(expected) elif box_cls is DataFrame: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected).to_frame() + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f01d8822241c9..360e1d5ddd3ff 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -554,7 +554,7 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype - data_was_index = isinstance(data, ABCIndex) + infer_object = not isinstance(data, (ABCIndex, ABCSeries)) # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -607,7 +607,7 @@ def sanitize_array( if dtype is None: subarr = data - if data.dtype == object and not data_was_index: + if data.dtype == object and infer_object: subarr = maybe_infer_to_datetimelike(data) elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97a4e414608b8..703fece35b23a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -728,10 +728,6 @@ def __init__( NDFrame.__init__(self, data) return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - # GH47215 if isinstance(index, set): raise ValueError("index cannot be a set") @@ -896,18 +892,6 @@ def __init__( NDFrame.__init__(self, mgr) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtypes.iloc[0] != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The DataFrame " - "constructor will keep the original dtype in the future. 
" - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - # ---------------------------------------------------------------------- def __dataframe__( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 56030a15dc143..15c318e5e9caf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -490,8 +490,6 @@ def __new__( if not copy and isinstance(data, (ABCSeries, Index)): refs = data._references - is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) - # range if isinstance(data, (range, RangeIndex)): result = RangeIndex(start=data, copy=copy, name=name) @@ -508,7 +506,7 @@ def __new__( elif is_ea_or_datetimelike_dtype(data_dtype): pass - elif isinstance(data, (np.ndarray, Index, ABCSeries)): + elif isinstance(data, (np.ndarray, ABCMultiIndex)): if isinstance(data, ABCMultiIndex): data = data._values @@ -518,7 +516,9 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - + elif isinstance(data, (ABCSeries, Index)): + # GH 56244: Avoid potential inference on object types + pass elif is_scalar(data): raise cls._raise_scalar_data_error(data) elif hasattr(data, "__array__"): @@ -571,19 +571,7 @@ def __new__( klass = cls._dtype_to_subclass(arr.dtype) arr = klass._ensure_array(arr, arr.dtype, copy=False) - result = klass._simple_new(arr, name, refs=refs) - if dtype is None and is_pandas_object and data_dtype == np.object_: - if result.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Index " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - return result # type: ignore[return-value] + return klass._simple_new(arr, name, refs=refs) @classmethod def _ensure_array(cls, data, dtype, copy: bool): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index cea52bf8c91b2..23572975a1112 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -192,6 +192,7 @@ def ndarray_to_mgr( ) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray + infer_object = not isinstance(values, (ABCSeries, Index, ExtensionArray)) if isinstance(values, ABCSeries): if columns is None: @@ -287,15 +288,14 @@ def ndarray_to_mgr( # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(values.dtype): + if dtype is None and infer_object and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): - dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] block_values = [ - new_block_2d(dvals_list[n], placement=BlockPlacement(n)) - for n in range(len(dvals_list)) + new_block_2d(ensure_block_shape(dval, 2), placement=BlockPlacement(n)) + for n, dval in enumerate(maybe_datetime) ] else: bp = BlockPlacement(slice(len(columns))) diff --git a/pandas/core/series.py b/pandas/core/series.py index f67c0753fa9df..bfaba866c3dfd 100644 --- a/pandas/core/series.py +++ 
b/pandas/core/series.py @@ -389,10 +389,6 @@ def __init__( self.name = name return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - if isinstance(data, (ExtensionArray, np.ndarray)): if copy is not False: if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): @@ -438,7 +434,6 @@ def __init__( data = data.astype(dtype) refs = data._references - data = data._values copy = False elif isinstance(data, np.ndarray): @@ -512,17 +507,6 @@ def __init__( self.name = name self._set_axis(0, index) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Series " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old behavior.", - FutureWarning, - stacklevel=find_stack_level(), - ) - def _init_dict( self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index bc931b53b37d0..eb5177e393936 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -228,12 +228,12 @@ def test_dataframe_from_series_or_index_different_dtype(index_or_series): assert df._mgr._has_no_reference(0) -def test_dataframe_from_series_infer_datetime(): +def test_dataframe_from_series_dont_infer_datetime(): ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - df = DataFrame(ser) - assert not np.shares_memory(get_array(ser), get_array(df, 0)) - assert df._mgr._has_no_reference(0) + df = DataFrame(ser) + assert df.dtypes.iloc[0] == np.dtype(object) + assert np.shares_memory(get_array(ser), get_array(df, 0)) + assert not df._mgr._has_no_reference(0) @pytest.mark.parametrize("index", [None, [0, 1, 2]]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index cbd969e5d90bf..5032932256488 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2702,21 +2702,14 @@ def test_frame_string_inference_block_dim(self): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - idx = Index([Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = DataFrame(idx, columns=["a"]) - assert result.dtypes.iloc[0] != np.object_ - result = DataFrame({"a": idx}) + obj = klass([Timestamp("2019-12-31")], dtype=object) + result = DataFrame(obj, columns=["a"]) assert result.dtypes.iloc[0] == np.object_ - ser = Series([Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = DataFrame(ser, columns=["a"]) - assert result.dtypes.iloc[0] != np.object_ - result = DataFrame({"a": ser}) + result = DataFrame({"a": obj}) assert result.dtypes.iloc[0] == np.object_ def test_dict_keys_returns_rangeindex(self): diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 
e5956f808286d..6036eddce7a01 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -59,18 +59,12 @@ def test_index_string_inference(self): ser = Index(["a", 1]) tm.assert_index_equal(ser, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - idx = Index([pd.Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(idx) - assert result.dtype != np.object_ - - ser = Series([pd.Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(ser) - assert result.dtype != np.object_ + obj = klass([pd.Timestamp("2019-12-31")], dtype=object) + result = Index(obj) + assert result.dtype == np.object_ def test_constructor_not_read_only(self): # GH#57130 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index bd38e6c2ff333..e701a49ea93ad 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -104,16 +104,9 @@ def test_constructor_copy(self, using_infer_string): ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(index.astype(object)) - else: - result = Index(index) - - tm.assert_index_equal(result, index) - - if isinstance(index, DatetimeIndex): - assert result.tz == index.tz - if cast_as_obj: + result = Index(index.astype(object)) + assert result.dtype == np.dtype(object) + if isinstance(index, DatetimeIndex): # GH#23524 check that Index(dti, dtype=object) does not # incorrectly raise ValueError, and that nanoseconds are not # dropped @@ -121,6 +114,10 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): result = Index(index, dtype=object) assert result.dtype == np.object_ assert list(result) == list(index) + else: + result = Index(index) + + tm.assert_index_equal(result, index) @pytest.mark.parametrize( "index,has_tz", diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 8c60f7beb317d..49ae0a60e6608 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -256,9 +256,8 @@ def test_dt_accessor_limited_display_api(self): tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # Period - idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - ser = Series(idx) + idx = period_range("20130101", periods=5, freq="D", name="xxx") + ser = Series(idx) results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 875ffdd3fe851..b94723b7cbddf 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -82,15 +82,13 @@ def test_equals_matching_nas(): left = Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert 
Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.timedelta64("NaT")], dtype=object) right = Series([np.timedelta64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.float64("NaN")], dtype=object) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 00c614cf72c20..44a7862c21273 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1318,9 +1318,8 @@ def test_constructor_periodindex(self): pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - expected = Series(pi.astype(object)) - tm.assert_series_equal(s, expected) + expected = Series(pi.astype(object)) + assert expected.dtype == object def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} @@ -2137,20 +2136,14 @@ def test_series_string_inference_na_first(self): result = Series([pd.NA, "b"]) tm.assert_series_equal(result, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - ser = Series([Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(None): - # This doesn't do inference - result = Series(ser) + obj = klass([Timestamp("2019-12-31")], dtype=object) + # This doesn't do inference + result = Series(obj) assert result.dtype == np.object_ - idx = Index([Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Series(idx) - assert result.dtype != np.object_ - class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index edfc1973a2bd9..dad5c73b89626 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -23,7 +23,6 @@ date_range, period_range, ) -import pandas._testing as tm from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -202,17 +201,6 @@ def test_infer_freq_custom(base_delta_code_pair, constructor): assert frequencies.infer_freq(index) is None -@pytest.mark.parametrize( - "freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")] -) -def test_infer_freq_index(freq, expected): - rng = period_range("1959Q2", "2009Q3", freq=freq) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - rng = Index(rng.to_timestamp("D", how="e").astype(object)) - - assert rng.inferred_freq == expected - - @pytest.mark.parametrize( "expected,dates", list( From 199bf2084a6e755e15fdf59ea97341c13ed10f69 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Jun 2024 08:44:57 -1000 Subject: [PATCH 041/272] REF: Remove BlockManager.arrays in favor of BlockManager.blocks usage (#58804) * REF: Remove BlockManager.arrays in favor of BlockManager.blocks usage * Add back arrays * Whitespace --- pandas/_testing/__init__.py | 4 +-- pandas/core/frame.py | 17 ++++----- pandas/core/generic.py | 6 ++-- pandas/core/indexing.py | 4 +-- 
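[Editor's note — a small sketch of the behavior change in PATCH 040, written against the post-patch semantics; nothing here is part of the patch itself.]

import numpy as np
import pandas as pd

ser = pd.Series([pd.Timestamp("2019-12-31")], dtype=object)

# The constructors now keep the object dtype of a pandas input as-is ...
assert pd.Index(ser).dtype == np.object_
assert pd.Series(ser).dtype == np.object_

# ... and inference becomes opt-in via .infer_objects, per the whatsnew
# entry above:
inferred = pd.Index(ser.infer_objects())
assert inferred.dtype.kind == "M"  # a datetime64 dtype, no longer object
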
From 199bf2084a6e755e15fdf59ea97341c13ed10f69 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 3 Jun 2024 08:44:57 -1000
Subject: [PATCH 041/272] REF: Remove BlockManager.arrays in favor of BlockManager.blocks usage (#58804)

* REF: Remove BlockManager.arrays in favor of BlockManager.blocks usage

* Add back arrays

* Whitespace
---
 pandas/_testing/__init__.py                   |  4 +--
 pandas/core/frame.py                          | 17 ++++-----
 pandas/core/generic.py                        |  6 ++--
 pandas/core/indexing.py                       |  4 +--
 pandas/core/internals/managers.py             |  4 ++-
 pandas/tests/apply/test_str.py                |  2 +-
 pandas/tests/extension/base/casting.py        |  5 +--
 pandas/tests/extension/base/constructors.py   |  4 +--
 pandas/tests/extension/base/getitem.py        |  2 +-
 pandas/tests/extension/base/reshaping.py      |  2 +-
 pandas/tests/frame/indexing/test_setitem.py   |  4 +--
 pandas/tests/frame/methods/test_cov_corr.py   |  2 +-
 pandas/tests/frame/methods/test_fillna.py     |  2 +-
 pandas/tests/frame/methods/test_shift.py      |  6 ++--
 pandas/tests/frame/methods/test_values.py     |  2 +-
 pandas/tests/frame/test_constructors.py       | 36 +++++++++----------
 pandas/tests/groupby/aggregate/test_cython.py |  2 +-
 pandas/tests/indexing/test_iloc.py            |  2 +-
 pandas/tests/reshape/concat/test_concat.py    | 25 +++++++------
 pandas/tests/reshape/merge/test_merge.py      |  4 +--
 pandas/tests/series/indexing/test_setitem.py  |  4 +--
 21 files changed, 74 insertions(+), 65 deletions(-)

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index a757ef6fc1a29..d35242ada21e9 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -531,8 +531,8 @@ def shares_memory(left, right) -> bool:
             left._mask, right._mask
         )
 
-    if isinstance(left, DataFrame) and len(left._mgr.arrays) == 1:
-        arr = left._mgr.arrays[0]
+    if isinstance(left, DataFrame) and len(left._mgr.blocks) == 1:
+        arr = left._mgr.blocks[0].values
         return shares_memory(arr, right)
 
     raise NotImplementedError(type(left), type(right))
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 703fece35b23a..c37dfa225de5a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1046,7 +1046,7 @@ def _is_homogeneous_type(self) -> bool:
         False
         """
         # The "<" part of "<=" here is for empty DataFrame cases
-        return len({arr.dtype for arr in self._mgr.arrays}) <= 1
+        return len({block.values.dtype for block in self._mgr.blocks}) <= 1
 
     @property
     def _can_fast_transpose(self) -> bool:
@@ -5726,7 +5726,6 @@ def shift(
             periods = cast(int, periods)
 
         ncols = len(self.columns)
-        arrays = self._mgr.arrays
         if axis == 1 and periods != 0 and ncols > 0 and freq is None:
             if fill_value is lib.no_default:
                 # We will infer fill_value to match the closest column
@@ -5752,12 +5751,12 @@ def shift(
                 result.columns = self.columns.copy()
                 return result
 
-            elif len(arrays) > 1 or (
+            elif len(self._mgr.blocks) > 1 or (
                 # If we only have one block and we know that we can't
                 # keep the same dtype (i.e. the _can_hold_element check)
                 # then we can go through the reindex_indexer path
                 # (and avoid casting logic in the Block method).
-                not can_hold_element(arrays[0], fill_value)
+                not can_hold_element(self._mgr.blocks[0].values, fill_value)
             ):
                 # GH#35488 we need to watch out for multi-block cases
                 # We only get here with fill_value not-lib.no_default
@@ -11453,7 +11452,7 @@ def _get_data() -> DataFrame:
         if numeric_only:
             df = _get_data()
         if axis is None:
-            dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
+            dtype = find_common_type([block.values.dtype for block in df._mgr.blocks])
             if isinstance(dtype, ExtensionDtype):
                 df = df.astype(dtype)
                 arr = concat_compat(list(df._iter_column_arrays()))
@@ -11478,7 +11477,9 @@ def _get_data() -> DataFrame:
 
             if df.shape[1] and name != "kurt":
-                dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
+                dtype = find_common_type(
+                    [block.values.dtype for block in df._mgr.blocks]
+                )
                 if isinstance(dtype, ExtensionDtype):
                     # GH 54341: fastpath for EA-backed axis=1 reductions
                     # This flattens the frame into a single 1D array while keeping
@@ -11552,8 +11553,8 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
         else:
             raise NotImplementedError(name)
 
-        for arr in self._mgr.arrays:
-            middle = func(arr, axis=0, skipna=skipna)
+        for blocks in self._mgr.blocks:
+            middle = func(blocks.values, axis=0, skipna=skipna)
             result = ufunc(result, middle)
 
         res_ser = self._constructor_sliced(result, index=self.index, copy=False)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 22eecdc95934f..80314c2648f45 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6373,7 +6373,7 @@ def astype(
             # TODO(EA2D): special case not needed with 2D EAs
             dtype = pandas_dtype(dtype)
             if isinstance(dtype, ExtensionDtype) and all(
-                arr.dtype == dtype for arr in self._mgr.arrays
+                block.values.dtype == dtype for block in self._mgr.blocks
             ):
                 return self.copy(deep=False)
             # GH 18099/22869: columnwise conversion to extension dtype
@@ -11148,9 +11148,9 @@ def _logical_func(
         if (
             self.ndim > 1
             and axis == 1
-            and len(self._mgr.arrays) > 1
+            and len(self._mgr.blocks) > 1
             # TODO(EA2D): special-case not needed
-            and all(x.ndim == 2 for x in self._mgr.arrays)
+            and all(block.values.ndim == 2 for block in self._mgr.blocks)
             and not kwargs
         ):
             # Fastpath avoiding potentially expensive transpose
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index e9bd3b389dd75..9140b1dbe9b33 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1804,10 +1804,10 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc") -> None:
 
         # if there is only one block/type, still have to take split path
        # unless the block is one-dimensional or it can hold the value
-        if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1:
+        if not take_split_path and len(self.obj._mgr.blocks) and self.ndim > 1:
             # in case of dict, keys are indices
             val = list(value.values()) if isinstance(value, dict) else value
-            arr = self.obj._mgr.arrays[0]
+            arr = self.obj._mgr.blocks[0].values
             take_split_path = not can_hold_element(
                 arr, extract_array(val, extract_numpy=True)
             )
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 7c1bcbec1d3f2..82b88d090f847 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -353,6 +353,8 @@ def arrays(self) -> list[ArrayLike]:
         Warning! The returned arrays don't handle Copy-on-Write, so this should
         be used with caution (only in read-mode).
         """
+        # TODO: Deprecate, usage in Dask
+        # https://github.com/dask/dask/blob/484fc3f1136827308db133cd256ba74df7a38d8c/dask/base.py#L1312
         return [blk.values for blk in self.blocks]
 
     def __repr__(self) -> str:
@@ -2068,7 +2070,7 @@ def array(self) -> ArrayLike:
         """
         Quick access to the backing array of the Block.
         """
-        return self.arrays[0]
+        return self.blocks[0].values
 
     # error: Cannot override writeable attribute with read-only property
     @property
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
index e0c5e337fb746..e224b07a1097b 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -287,7 +287,7 @@ def test_transform_groupby_kernel_frame(request, float_frame, op):
     # same thing, but ensuring we have multiple blocks
     assert "E" not in float_frame.columns
     float_frame["E"] = float_frame["A"].copy()
-    assert len(float_frame._mgr.arrays) > 1
+    assert len(float_frame._mgr.blocks) > 1
 
     ones = np.ones(float_frame.shape[0])
     gb2 = float_frame.groupby(ones)
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index 2bfe801c48a77..e924e38ee5030 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -30,8 +30,9 @@ def test_astype_object_frame(self, all_data):
         blk = result._mgr.blocks[0]
         assert isinstance(blk, NumpyBlock), type(blk)
         assert blk.is_object
-        assert isinstance(result._mgr.arrays[0], np.ndarray)
-        assert result._mgr.arrays[0].dtype == np.dtype(object)
+        arr = result._mgr.blocks[0].values
+        assert isinstance(arr, np.ndarray)
+        assert arr.dtype == np.dtype(object)
 
         # check that we can compare the dtypes
         comp = result.dtypes == df.dtypes
diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py
index c32a6a6a115ac..639dc874c9fb9 100644
--- a/pandas/tests/extension/base/constructors.py
+++ b/pandas/tests/extension/base/constructors.py
@@ -69,7 +69,7 @@ def test_dataframe_constructor_from_dict(self, data, from_series):
         assert result.shape == (len(data), 1)
         if hasattr(result._mgr, "blocks"):
             assert isinstance(result._mgr.blocks[0], EABackedBlock)
-        assert isinstance(result._mgr.arrays[0], ExtensionArray)
+        assert isinstance(result._mgr.blocks[0].values, ExtensionArray)
 
     def test_dataframe_from_series(self, data):
         result = pd.DataFrame(pd.Series(data))
@@ -77,7 +77,7 @@ def test_dataframe_from_series(self, data):
         assert result.shape == (len(data), 1)
         if hasattr(result._mgr, "blocks"):
             assert isinstance(result._mgr.blocks[0], EABackedBlock)
-        assert isinstance(result._mgr.arrays[0], ExtensionArray)
+        assert isinstance(result._mgr.blocks[0].values, ExtensionArray)
 
     def test_series_given_mismatched_index_raises(self, data):
         msg = r"Length of values \(3\) does not match length of index \(5\)"
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
index 935edce32a0ab..3fa2f50bf4930 100644
--- a/pandas/tests/extension/base/getitem.py
+++ b/pandas/tests/extension/base/getitem.py
@@ -450,7 +450,7 @@ def test_loc_len1(self, data):
         df = pd.DataFrame({"A": data})
         res = df.loc[[0], "A"]
         assert res.ndim == 1
-        assert res._mgr.arrays[0].ndim == 1
+        assert res._mgr.blocks[0].ndim == 1
         if hasattr(res._mgr, "blocks"):
             assert res._mgr._block.ndim == 1
 
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
index 489cd15644d04..24be94443c5ba 100644
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
@@ -29,7 +29,7 @@ def test_concat(self, data, in_frame):
         assert dtype == data.dtype
         if hasattr(result._mgr, "blocks"):
             assert isinstance(result._mgr.blocks[0], EABackedBlock)
-        assert isinstance(result._mgr.arrays[0], ExtensionArray)
+        assert isinstance(result._mgr.blocks[0].values, ExtensionArray)
 
     @pytest.mark.parametrize("in_frame", [True, False])
     def test_concat_all_na_block(self, data_missing, in_frame):
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index fe477f88c81ff..15cdc6566b570 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -340,8 +340,8 @@ def test_setitem_dt64tz(self, timezone_frame):
         # assert that A & C are not sharing the same base (e.g. they
         # are copies)
         # Note: This does not hold with Copy on Write (because of lazy copying)
-        v1 = df._mgr.arrays[1]
-        v2 = df._mgr.arrays[2]
+        v1 = df._mgr.blocks[1].values
+        v2 = df._mgr.blocks[2].values
         tm.assert_extension_array_equal(v1, v2)
         v1base = v1._ndarray.base
         v2base = v2._ndarray.base
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
index 4151a1d27d06a..aeaf80f285f9d 100644
--- a/pandas/tests/frame/methods/test_cov_corr.py
+++ b/pandas/tests/frame/methods/test_cov_corr.py
@@ -214,7 +214,7 @@ def test_corr_item_cache(self):
         df["B"] = range(10)[::-1]
 
         ser = df["A"]  # populate item_cache
-        assert len(df._mgr.arrays) == 2  # i.e. 2 blocks
+        assert len(df._mgr.blocks) == 2
 
         _ = df.corr(numeric_only=True)
 
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
index 2ef7780e9a6d5..1b852343266aa 100644
--- a/pandas/tests/frame/methods/test_fillna.py
+++ b/pandas/tests/frame/methods/test_fillna.py
@@ -47,7 +47,7 @@ def test_fillna_on_column_view(self):
         assert np.isnan(arr[:, 0]).all()
 
         # i.e. we didn't create a new 49-column block
-        assert len(df._mgr.arrays) == 1
+        assert len(df._mgr.blocks) == 1
         assert np.shares_memory(df.values, arr)
 
     def test_fillna_datetime(self, datetime_frame):
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py
index 72c1a123eac98..4e490e9e344ba 100644
--- a/pandas/tests/frame/methods/test_shift.py
+++ b/pandas/tests/frame/methods/test_shift.py
@@ -320,7 +320,7 @@ def test_shift_categorical1(self, frame_or_series):
         def get_cat_values(ndframe):
             # For Series we could just do ._values; for DataFrame
             # we may be able to do this if we ever have 2D Categoricals
-            return ndframe._mgr.arrays[0]
+            return ndframe._mgr.blocks[0].values
 
         cat = get_cat_values(obj)
 
@@ -560,7 +560,7 @@ def test_shift_dt64values_int_fill_deprecated(self):
         # same thing but not consolidated; pre-2.0 we got different behavior
         df3 = DataFrame({"A": ser})
         df3["B"] = ser
-        assert len(df3._mgr.arrays) == 2
+        assert len(df3._mgr.blocks) == 2
         result = df3.shift(1, axis=1, fill_value=0)
         tm.assert_frame_equal(result, expected)
 
@@ -621,7 +621,7 @@ def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat):
         # same thing but not consolidated
         df3 = DataFrame({"A": ser})
         df3["B"] = ser
-        assert len(df3._mgr.arrays) == 2
+        assert len(df3._mgr.blocks) == 2
         result = df3.shift(-1, axis=1, fill_value="foo")
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py
index dfece3fc7552b..2de2053bb705f 100644
--- a/pandas/tests/frame/methods/test_values.py
+++ b/pandas/tests/frame/methods/test_values.py
@@ -256,7 +256,7 @@ def test_private_values_dt64_multiblock(self):
 
         df = DataFrame({"A": dta[:4]}, copy=False)
         df["B"] = dta[4:]
-        assert len(df._mgr.arrays) == 2
+        assert len(df._mgr.blocks) == 2
 
         result = df._values
         expected = dta.reshape(2, 4).T
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 5032932256488..da0504458cf5d 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -180,24 +180,24 @@ def test_datetimelike_values_with_object_dtype(self, kind, frame_or_series):
             arr = arr[:, 0]
 
         obj = frame_or_series(arr, dtype=object)
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
 
         # go through a different path in internals.construction
         obj = frame_or_series(frame_or_series(arr), dtype=object)
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
 
         obj = frame_or_series(frame_or_series(arr), dtype=NumpyEADtype(object))
-        assert obj._mgr.arrays[0].dtype == object
-        assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
 
         if frame_or_series is DataFrame:
             # other paths through internals.construction
             sers = [Series(x) for x in arr]
             obj = frame_or_series(sers, dtype=object)
-            assert obj._mgr.arrays[0].dtype == object
-            assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type)
+            assert obj._mgr.blocks[0].values.dtype == object
+            assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
 
     def test_series_with_name_not_matching_column(self):
         # GH#9232
@@ -297,7 +297,7 @@ def test_constructor_dtype_nocast_view_dataframe(self):
     def test_constructor_dtype_nocast_view_2d_array(self):
         df = DataFrame([[1, 2], [3, 4]], dtype="int64")
         df2 = DataFrame(df.values, dtype=df[0].dtype)
-        assert df2._mgr.arrays[0].flags.c_contiguous
+        assert df2._mgr.blocks[0].values.flags.c_contiguous
 
     @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies")
     def test_1d_object_array_does_not_copy(self):
@@ -2493,9 +2493,9 @@ def get_base(obj):
         def check_views(c_only: bool = False):
             # Check that the underlying data behind df["c"] is still `c`
             # after setting with iloc. Since we don't know which entry in
-            # df._mgr.arrays corresponds to df["c"], we just check that exactly
+            # df._mgr.blocks corresponds to df["c"], we just check that exactly
             # one of these arrays is `c`. GH#38939
-            assert sum(x is c for x in df._mgr.arrays) == 1
+            assert sum(x.values is c for x in df._mgr.blocks) == 1
             if c_only:
                 # If we ever stop consolidating in setitem_with_indexer,
                 # this will become unnecessary.
@@ -2503,17 +2503,17 @@ def check_views(c_only: bool = False):
 
             assert (
                 sum(
-                    get_base(x) is a
-                    for x in df._mgr.arrays
-                    if isinstance(x.dtype, np.dtype)
+                    get_base(x.values) is a
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
                 )
                 == 1
             )
             assert (
                 sum(
-                    get_base(x) is b
-                    for x in df._mgr.arrays
-                    if isinstance(x.dtype, np.dtype)
+                    get_base(x.values) is b
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
                 )
                 == 1
             )
@@ -3045,7 +3045,7 @@ def test_construction_from_ndarray_datetimelike(self):
         # constructed from 2D ndarray
         arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3)
         df = DataFrame(arr)
-        assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays)
+        assert all(isinstance(block.values, DatetimeArray) for block in df._mgr.blocks)
 
     def test_construction_from_ndarray_with_eadtype_mismatched_columns(self):
         arr = np.random.default_rng(2).standard_normal((10, 2))
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index aafd06e8f88cf..bf9e82480785c 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -285,7 +285,7 @@ def test_read_only_buffer_source_agg(agg):
             "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
         }
     )
-    df._mgr.arrays[0].flags.writeable = False
+    df._mgr.blocks[0].values.flags.writeable = False
 
     result = df.groupby(["species"]).agg({"sepal_length": agg})
     expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})
 
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index 172aa9878caec..8b90a6c32849d 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -114,7 +114,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, index_or_series_or_array
         if frame_or_series is Series:
             values = obj.values
         else:
-            values = obj._mgr.arrays[0]
+            values = obj._mgr.blocks[0].values
 
         if frame_or_series is Series:
             obj.iloc[:2] = index_or_series_or_array(arr[2:])
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index f86cc0c69d363..550b424371a95 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -5,6 +5,7 @@
 from collections.abc import Iterator
 from datetime import datetime
 from decimal import Decimal
+import itertools
 
 import numpy as np
 import pytest
@@ -51,35 +52,39 @@ def test_concat_copy(self):
 
         # These are actual copies.
         result = concat([df, df2, df3], axis=1)
-        for arr in result._mgr.arrays:
-            assert arr.base is not None
+        for block in result._mgr.blocks:
+            assert block.values.base is not None
 
         # These are the same.
         result = concat([df, df2, df3], axis=1)
-        for arr in result._mgr.arrays:
+        for block in result._mgr.blocks:
+            arr = block.values
             if arr.dtype.kind == "f":
-                assert arr.base is df._mgr.arrays[0].base
+                assert arr.base is df._mgr.blocks[0].values.base
             elif arr.dtype.kind in ["i", "u"]:
-                assert arr.base is df2._mgr.arrays[0].base
+                assert arr.base is df2._mgr.blocks[0].values.base
             elif arr.dtype == object:
                 assert arr.base is not None
 
         # Float block was consolidated.
         df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
         result = concat([df, df2, df3, df4], axis=1)
-        for arr in result._mgr.arrays:
+        for blocks in result._mgr.blocks:
+            arr = blocks.values
             if arr.dtype.kind == "f":
                 # this is a view on some array in either df or df4
                 assert any(
-                    np.shares_memory(arr, other)
-                    for other in df._mgr.arrays + df4._mgr.arrays
+                    np.shares_memory(arr, block.values)
+                    for block in itertools.chain(df._mgr.blocks, df4._mgr.blocks)
                 )
             elif arr.dtype.kind in ["i", "u"]:
-                assert arr.base is df2._mgr.arrays[0].base
+                assert arr.base is df2._mgr.blocks[0].values.base
             elif arr.dtype == object:
                 # this is a view on df3
-                assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays)
+                assert any(
+                    np.shares_memory(arr, block.values) for block in df3._mgr.blocks
+                )
 
     def test_concat_with_group_keys(self):
         # axis=0
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 5c5c06dea0008..0a5989e3c82e6 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1451,8 +1451,8 @@ def test_merge_readonly(self):
         )
 
         # make each underlying block array / column array read-only
-        for arr in data1._mgr.arrays:
-            arr.flags.writeable = False
+        for block in data1._mgr.blocks:
+            block.values.flags.writeable = False
 
         data1.merge(data2)  # no error
 
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
index b94e6b6f0c6c8..69fba8925784e 100644
--- a/pandas/tests/series/indexing/test_setitem.py
+++ b/pandas/tests/series/indexing/test_setitem.py
@@ -461,9 +461,9 @@ def test_dt64tz_setitem_does_not_mutate_dti(self):
         ser = Series(dti)
         assert ser._values is not dti
         assert ser._values._ndarray.base is dti._data._ndarray.base
-        assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base
+        assert ser._mgr.blocks[0].values._ndarray.base is dti._data._ndarray.base
 
-        assert ser._mgr.arrays[0] is not dti
+        assert ser._mgr.blocks[0].values is not dti
 
         ser[::3] = NaT
         assert ser[0] is NaT
From ff550e69292181341dc4c330f8bf5e04ee0c69cc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 3 Jun 2024 13:48:13 -0700
Subject: [PATCH 042/272] [pre-commit.ci] pre-commit autoupdate (#58903)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [pre-commit.ci] pre-commit autoupdate

updates:
- [github.com/astral-sh/ruff-pre-commit: v0.4.3 → v0.4.7](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.3...v0.4.7)
- [github.com/codespell-project/codespell: v2.2.6 → v2.3.0](https://github.com/codespell-project/codespell/compare/v2.2.6...v2.3.0)
- [github.com/pre-commit/mirrors-clang-format: v18.1.4 → v18.1.5](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.4...v18.1.5)

* Add codespell ignores

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 .pre-commit-config.yaml                  | 6 +++---
 pandas/tests/internals/test_internals.py | 2 +-
 pyproject.toml                           | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a5de902866611..bf88500b10524 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.3
+    rev: v0.4.7
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -40,7 +40,7 @@ repos:
         pass_filenames: true
         require_serial: false
 -   repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
     -   id: codespell
         types_or: [python, rst, markdown, cython, c]
@@ -92,7 +92,7 @@ repos:
     -   id: sphinx-lint
         args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v18.1.4
+    rev: v18.1.5
     hooks:
     -   id: clang-format
         files: ^pandas/_libs/src|^pandas/_libs/include
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 749e2c4a86b55..60ca47b52b373 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -1397,7 +1397,7 @@ def test_make_block_no_pandas_array(block_maker):
     assert result.dtype.kind in ["i", "u"]
     assert result.is_extension is False
 
-    # new_block no longer taked dtype keyword
+    # new_block no longer accepts dtype keyword
     # ndarray, NumpyEADtype
     result = block_maker(
         arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim
diff --git a/pyproject.toml b/pyproject.toml
index 085c054f8241a..e7d7474134c3a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -722,5 +722,5 @@ exclude_lines = [
 directory = "coverage_html_report"
 
 [tool.codespell]
-ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext"
+ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru"
 ignore-regex = 'https://([\w/\.])+'
v18.1.5](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.4...v18.1.5) * Add codespell ignores --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 6 +++--- pandas/tests/internals/test_internals.py | 2 +- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a5de902866611..bf88500b10524 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.3 + rev: v0.4.7 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -40,7 +40,7 @@ repos: pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] @@ -92,7 +92,7 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.4 + rev: v18.1.5 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 749e2c4a86b55..60ca47b52b373 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1397,7 +1397,7 @@ def test_make_block_no_pandas_array(block_maker): assert result.dtype.kind in ["i", "u"] assert result.is_extension is False - # new_block no longer taked dtype keyword + # new_block no longer accepts dtype keyword # ndarray, NumpyEADtype result = block_maker( arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim diff --git a/pyproject.toml b/pyproject.toml index 085c054f8241a..e7d7474134c3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -722,5 +722,5 @@ exclude_lines = [ directory = "coverage_html_report" [tool.codespell] -ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext" +ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru" ignore-regex = 'https://([\w/\.])+' From 76c7274985215c487248fa5640e12a9b32a06e8c Mon Sep 17 00:00:00 2001 From: pedrocariellof <105252210+pedrocariellof@users.noreply.github.com> Date: Mon, 3 Jun 2024 18:27:15 -0300 Subject: [PATCH 043/272] DEPR: DataFrameGroupBy.corrwith (#58732) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/conftest.py | 1 + pandas/core/groupby/generic.py | 9 ++++++ pandas/tests/groupby/test_all_methods.py | 27 +++++++++--------- pandas/tests/groupby/test_apply.py | 9 +++++- pandas/tests/groupby/test_categorical.py | 28 ++++++++++++++++--- pandas/tests/groupby/test_groupby_dropna.py | 18 ++++++++++-- pandas/tests/groupby/test_numeric_only.py | 18 ++++++++++-- pandas/tests/groupby/test_raises.py | 8 ++++++ .../tests/groupby/transform/test_transform.py | 17 +++++++++-- 10 files changed, 111 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2707adb06a1d6..f54f859bd43ff 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -263,6 +263,7 @@ starting with 3.0, so it can be safely removed from your code. 
Other Deprecations ^^^^^^^^^^^^^^^^^^ +- Deprecated :meth:`.DataFrameGroupby.corrwith` (:issue:`57158`) - Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`) - Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`) diff --git a/pandas/conftest.py b/pandas/conftest.py index 0ab51139528ad..163c3890a7f6d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -150,6 +150,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("is_categorical_dtype", "is_categorical_dtype is deprecated"), ("is_sparse", "is_sparse is deprecated"), ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"), + ("DataFrameGroupBy.corrwith", "DataFrameGroupBy.corrwith is deprecated"), ("NDFrame.replace", "Series.replace without 'value'"), ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"), ("Series.idxmin", "The behavior of Series.idxmin"), diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0c4f22f736d4a..945b9f9c14c0b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,6 +21,7 @@ Union, cast, ) +import warnings import numpy as np @@ -32,6 +33,7 @@ Substitution, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int64, @@ -2726,6 +2728,8 @@ def corrwith( """ Compute pairwise correlation. + .. deprecated:: 3.0.0 + Pairwise correlation is computed between rows or columns of DataFrame with rows or columns of Series or DataFrame. 
DataFrames are first aligned along both axes before computing the @@ -2785,6 +2789,11 @@ def corrwith( 2 0.755929 NaN 3 0.576557 NaN """ + warnings.warn( + "DataFrameGroupBy.corrwith is deprecated", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "corrwith", other=other, diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index ad35bec70f668..945c3e421a132 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -25,9 +25,12 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): gb = df.groupby(["a", "b", "c"], group_keys=False) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) - - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = method(*args).index expected = df.index @@ -42,18 +45,12 @@ def test_duplicate_columns(request, groupby_func, as_index): df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) args = get_groupby_method_args(groupby_func, df) gb = df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = getattr(gb, groupby_func)(*args) + result = getattr(gb, groupby_func)(*args) expected_df = df.set_axis(["a", "b", "c"], axis=1) expected_args = get_groupby_method_args(groupby_func, expected_df) expected_gb = expected_df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = getattr(expected_gb, groupby_func)(*expected_args) + expected = getattr(expected_gb, groupby_func)(*expected_args) if groupby_func not in ("size", "ngroup", "cumcount"): expected = expected.rename(columns={"c": "b"}) tm.assert_equal(result, expected) @@ -74,8 +71,12 @@ def test_dup_labels_output_shape(groupby_func, idx): grp_by = df.groupby([0]) args = get_groupby_method_args(groupby_func, df) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = getattr(grp_by, groupby_func)(*args) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ac853746cf008..75801b9e039f6 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1197,7 +1197,14 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) - _ = getattr(grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + _ = getattr(grp, reduction_func)(*args) result = grp.apply(np.sum, axis=0, include_groups=False) 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 2194e5692aa0e..010bd9ee52555 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1473,7 +1473,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun df_grp = df.groupby(["cat_1", "cat_2"], observed=True) args = get_groupby_method_args(reduction_func, df) - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) for cat in unobserved_cats: assert cat not in res.index @@ -1512,7 +1519,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( getattr(df_grp, reduction_func)(*args) return - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1904,8 +1918,14 @@ def test_category_order_reducer( ): getattr(gb, reduction_func)(*args) return - - op_result = getattr(gb, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories else: diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d3b3c945e06de..4749e845a0e59 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -543,7 +543,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki return gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) - expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].cat.remove_categories([4]) if index_kind == "multi": expected["x2"] = expected["x2"].cat.remove_categories([4]) @@ -567,7 +574,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki if as_index: expected = expected["size"].rename(None) - result = getattr(gb_keepna, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb_keepna, reduction_func)(*args) # size will return a Series, others are DataFrame tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 33cdd1883e1b9..afbc64429e93c 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -256,7 
+256,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): method = getattr(gb, kernel) if has_arg and numeric_only is True: # Cases where b does not appear in the result - result = method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = method(*args, **kwargs) assert "b" not in result.columns elif ( # kernels that work on any dtype and have numeric_only arg @@ -296,7 +303,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" with pytest.raises(exception, match=msg): - method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 9301f8d56d9d2..5a8192a9ffe02 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -183,6 +183,8 @@ def test_groupby_raises_string( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -288,6 +290,8 @@ def test_groupby_raises_datetime( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @@ -485,6 +489,8 @@ def test_groupby_raises_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -658,6 +664,8 @@ def test_groupby_raises_category_on_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index d6d545a8c4834..726c57081373c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1104,7 +1104,14 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): return args = get_groupby_method_args(reduction_func, obj) - result = g.transform(func, *args) + if func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = g.transform(func, *args) # this is the *definition* of a transformation 
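# Illustrative sketch (editorial, not part of the upstream diff): the repeated
# warn/msg bookkeeping in these hunks relies on tm.assert_produces_warning(None)
# asserting that *no* warning fires, so one context manager serves both the
# deprecated "corrwith" kernel and every kernel that must stay silent:
import pandas as pd
import pandas._testing as tm

gb_demo = pd.DataFrame({"a": [1, 1, 2], "b": [1.0, 2.0, 4.0]}).groupby("a")
with tm.assert_produces_warning(None):
    gb_demo.transform("sum")  # a non-deprecated kernel must emit no warning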
tm.assert_index_equal(result.index, obj.index)
@@ -1468,8 +1475,12 @@ def test_as_index_no_change(keys, df, groupby_func):
     args = get_groupby_method_args(groupby_func, df)
     gb_as_index_true = df.groupby(keys, as_index=True)
     gb_as_index_false = df.groupby(keys, as_index=False)
-    warn = FutureWarning if groupby_func == "fillna" else None
-    msg = "DataFrameGroupBy.fillna is deprecated"
+    if groupby_func == "corrwith":
+        warn = FutureWarning
+        msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        msg = ""
     with tm.assert_produces_warning(warn, match=msg):
         result = gb_as_index_true.transform(groupby_func, *args)
     with tm.assert_produces_warning(warn, match=msg):

From 6c321bba6771c815f7503954fe7c69921830a76e Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 4 Jun 2024 09:44:18 -0700
Subject: [PATCH 044/272] DEPR: make_block (#57754)

---
 doc/source/whatsnew/v3.0.0.rst            |  2 +-
 pandas/core/internals/api.py              | 10 ++++++++
 pandas/io/feather_format.py               | 14 ++++++++---
 pandas/io/parquet.py                      | 23 ++++++++++++++---
 pandas/io/parsers/arrow_parser_wrapper.py | 30 ++++++++++++++---------
 pandas/tests/internals/test_api.py        |  5 +++-
 pandas/tests/internals/test_internals.py  | 20 ++++++++++-----
 pandas/tests/io/test_parquet.py           |  1 +
 8 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index f54f859bd43ff..2d4fafc3c4e1e 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -263,6 +263,7 @@ starting with 3.0, so it can be safely removed from your code.

 Other Deprecations
 ^^^^^^^^^^^^^^^^^^
+- Deprecated :func:`core.internals.api.make_block`, use public APIs instead (:issue:`56815`)
 - Deprecated :meth:`.DataFrameGroupBy.corrwith` (:issue:`57158`)
 - Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
 - Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
@@ -272,7 +273,6 @@ Other Deprecations
 - Deprecated behavior of :meth:`Series.dt.to_pytimedelta`, in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta; this matches the behavior of other :meth:`Series.dt` properties. (:issue:`57463`)
 - Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
 - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)
--

 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.prior_deprecations:

diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py
index 24bfad4791b29..04944db2ebd9c 100644
--- a/pandas/core/internals/api.py
+++ b/pandas/core/internals/api.py
@@ -10,6 +10,7 @@
 from __future__ import annotations

 from typing import TYPE_CHECKING
+import warnings

 import numpy as np

@@ -87,6 +88,15 @@ def make_block(
     - Block.make_block_same_class
     - Block.__init__
     """
+    warnings.warn(
+        # GH#56815
+        "make_block is deprecated and will be removed in a future version. 
" + "Use pd.api.internals.create_dataframe_from_blocks or " + "(recommended) higher-level public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + if dtype is not None: dtype = pandas_dtype(dtype) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 8132167fbe05c..16d4e1f9ea25d 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Any, ) +import warnings from pandas._config import using_pyarrow_string_dtype @@ -131,9 +132,16 @@ def read_feather( path, "rb", storage_options=storage_options, is_text=False ) as handles: if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): - return feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + + return feather.read_feather( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 08983ceed44e5..306b144811898 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -10,7 +10,10 @@ Any, Literal, ) -from warnings import catch_warnings +from warnings import ( + catch_warnings, + filterwarnings, +) from pandas._config import using_pyarrow_string_dtype @@ -271,7 +274,13 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = pa_table.to_pandas(**to_pandas_kwargs) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: @@ -384,7 +393,15 @@ def read( try: parquet_file = self.api.ParquetFile(path, **parquet_kwargs) - return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + return parquet_file.to_pandas( + columns=columns, filters=filters, **kwargs + ) finally: if handles is not None: handles.close() diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 8b6f7d5750ffe..cffdb28e2c9e4 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -287,17 +287,23 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + if dtype_backend == "pyarrow": + frame = table.to_pandas(types_mapper=pd.ArrowDtype) + elif dtype_backend == "numpy_nullable": + # Modify the default mapping to also + # map null to Int64 (to match other engines) + dtype_mapping = _arrow_dtype_mapping() + dtype_mapping[pa.null()] = pd.Int64Dtype() + frame = table.to_pandas(types_mapper=dtype_mapping.get) + elif using_pyarrow_string_dtype(): + frame = 
table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - frame = table.to_pandas() + else: + frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index c189d5248b1f3..591157bbe87fe 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -44,7 +44,10 @@ def test_namespace(): def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") - blk = api.make_block(dti, placement=[0]) + + msg = "make_block is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + blk = api.make_block(dti, placement=[0]) assert blk.shape == (1, 3) assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 60ca47b52b373..fca1ed39c0f9c 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1368,8 +1368,10 @@ def test_validate_ndim(): placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" + depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): - make_block(values, placement, ndim=2) + with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1384,8 +1386,12 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) + depr_msg = "make_block is deprecated" + warn = DeprecationWarning if block_maker is make_block else None + # NumpyExtensionArray, no dtype - result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: @@ -1393,14 +1399,16 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False # new_block no longer accepts dtype keyword # ndarray, NumpyEADtype - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 35275f3c23bef..af492b967bc1d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -985,6 +985,7 @@ def test_filter_row_groups(self, pa): result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 + @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning") def test_read_dtype_backend_pyarrow_config(self, pa, df_full): import pyarrow From cc18753374077fcdc1562e1e1da7e620392b42c5 Mon Sep 17 00:00:00 2001 From: Robin Shindelman <129821483+shindelr@users.noreply.github.com> Date: Tue, 4 Jun 2024 10:49:40 -0700 Subject: [PATCH 045/272] Proofreading/editing 
Getting Started for readability. Attempt 2, this time without extra files. (#58919) Proofreading/editing Getting Started for readability. Attempt2 --- doc/source/getting_started/index.rst | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index d9cb1de14aded..9f29f7f4f4406 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -134,8 +134,8 @@ to explore, clean, and process your data. In pandas, a data table is called a :c
-pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these -data sources is provided by function with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data. +pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). The ability to import data from each of these +data sources is provided by functions with the prefix, ``read_*``. Similarly, the ``to_*`` methods are used to store data. .. image:: ../_static/schemas/02_io_readwrite.svg :align: center @@ -181,7 +181,7 @@ data sources is provided by function with the prefix ``read_*``. Similarly, the
-Selecting or filtering specific rows and/or columns? Filtering the data on a condition? Methods for slicing, selecting, and extracting the +Selecting or filtering specific rows and/or columns? Filtering the data on a particular condition? Methods for slicing, selecting, and extracting the data you need are available in pandas. .. image:: ../_static/schemas/03_subset_columns_rows.svg @@ -228,7 +228,7 @@ data you need are available in pandas.
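A minimal illustrative sketch of the subsetting described above (frame and
column names are examples only)::

    import pandas as pd

    df = pd.DataFrame({"name": ["Ann", "Bob", "Cleo"], "age": [34, 29, 41]})
    over_30 = df[df["age"] > 30]                 # filter rows on a condition
    names = df["name"]                           # select a single column
    both = df.loc[df["age"] > 30, ["name"]]      # rows and columns at once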
-pandas provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) +pandas provides plotting for your data right out of the box with the power of Matplotlib. Simply pick the plot type (scatter, bar, boxplot,...) corresponding to your data. .. image:: ../_static/schemas/04_plot_overview.svg @@ -275,7 +275,7 @@ corresponding to your data.
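A minimal sketch of the plotting hook described above (illustrative only;
assumes Matplotlib is installed)::

    import pandas as pd

    ts = pd.Series(range(5), index=pd.date_range("2024-01-01", periods=5))
    ax = ts.plot(kind="line")  # or kind="bar", "box", "hist", ...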
-There is no need to loop over all rows of your data table to do calculations. Data manipulations on a column work elementwise. +There's no need to loop over all rows of your data table to do calculations. Column data manipulations work elementwise in pandas. Adding a column to a :class:`DataFrame` based on existing data in other columns is straightforward. .. image:: ../_static/schemas/05_newcolumn_2.svg @@ -322,7 +322,7 @@ Adding a column to a :class:`DataFrame` based on existing data in other columns
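A minimal sketch of the elementwise column arithmetic described above
(illustrative column names)::

    import pandas as pd

    df = pd.DataFrame({"price": [10.0, 12.5, 9.9]})
    df["price_with_tax"] = df["price"] * 1.21  # elementwise, no explicit loop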
-Basic statistics (mean, median, min, max, counts...) are easily calculable. These or custom aggregations can be applied on the entire +Basic statistics (mean, median, min, max, counts...) are easily calculable across data frames. These, or even custom aggregations, can be applied on the entire data set, a sliding window of the data, or grouped by categories. The latter is also known as the split-apply-combine approach. .. image:: ../_static/schemas/06_groupby.svg @@ -369,8 +369,8 @@ data set, a sliding window of the data, or grouped by categories. The latter is
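A minimal sketch of the split-apply-combine approach mentioned above
(illustrative data)::

    import pandas as pd

    df = pd.DataFrame({"city": ["NY", "NY", "LA"], "temp": [21.0, 25.0, 28.0]})
    means = df.groupby("city")["temp"].mean()  # split by city, apply mean, combine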
-Change the structure of your data table in multiple ways. You can :func:`~pandas.melt` your data table from wide to long/tidy form or :func:`~pandas.pivot` -from long to wide format. With aggregations built-in, a pivot table is created with a single command. +Change the structure of your data table in a variety of ways. You can use :func:`~pandas.melt` to reshape your data from a wide format to a long and tidy one. Use :func:`~pandas.pivot` + to go from long to wide format. With aggregations built-in, a pivot table can be created with a single command. .. image:: ../_static/schemas/07_melt.svg :align: center @@ -416,7 +416,7 @@ from long to wide format. With aggregations built-in, a pivot table is created w
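A minimal sketch of the wide/long reshaping described above (illustrative
frame)::

    import pandas as pd

    wide = pd.DataFrame({"id": [1, 2], "jan": [10, 20], "feb": [11, 21]})
    long_df = wide.melt(id_vars="id", var_name="month", value_name="sales")  # wide -> long
    wide_again = long_df.pivot(index="id", columns="month", values="sales")  # long -> wide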
-Multiple tables can be concatenated both column wise and row wise as database-like join/merge operations are provided to combine multiple tables of data. +Multiple tables can be concatenated column wise or row wise with pandas' database-like join and merge operations. .. image:: ../_static/schemas/08_concat_row.svg :align: center @@ -505,7 +505,7 @@ pandas has great support for time series and has an extensive set of tools for w
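A minimal sketch of the combine operations described above (illustrative
keys)::

    import pandas as pd

    a = pd.DataFrame({"key": [1, 2], "x": ["a", "b"]})
    b = pd.DataFrame({"key": [2, 3], "y": ["c", "d"]})
    stacked = pd.concat([a, b])                  # row-wise concatenation
    joined = a.merge(b, on="key", how="inner")   # database-like join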
-Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it.
+Data sets often contain more than just numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it.

 .. raw:: html

@@ -551,9 +551,9 @@ the pandas-equivalent operations compared to software you already know:
     :class-card: comparison-card
     :shadow: md

-    The `R programming language <https://www.r-project.org/>`__ provides the
-    ``data.frame`` data structure and multiple packages, such as
-    `tidyverse <https://www.tidyverse.org/>`__ use and extend ``data.frame``
+    The `R programming language <https://www.r-project.org/>`__ provides a
+    ``data.frame`` data structure as well as packages like
+    `tidyverse <https://www.tidyverse.org/>`__ which use and extend ``data.frame``
     for convenient data handling functionalities similar to pandas.

     +++

@@ -572,8 +572,8 @@ the pandas-equivalent operations compared to software you already know:

-    Already familiar to ``SELECT``, ``GROUP BY``, ``JOIN``, etc.?
-    Most of these SQL manipulations do have equivalents in pandas.
+    Already familiar with ``SELECT``, ``GROUP BY``, ``JOIN``, etc.?
+    Many SQL manipulations have equivalents in pandas.

     +++

@@ -631,10 +631,10 @@ the pandas-equivalent operations compared to software you already know:
     :class-card: comparison-card
     :shadow: md

-    The `SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__ statistical software suite
-    also provides the ``data set`` corresponding to the pandas ``DataFrame``.
-    Also SAS vectorized operations, filtering, string processing operations,
-    and more have similar functions in pandas.
+    `SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__, the statistical software suite,
+    uses the ``data set`` structure, which closely corresponds to pandas' ``DataFrame``.
+    Also SAS vectorized operations such as filtering or string processing operations
+    have similar functions in pandas.

     +++

From dcb542bfb0befec5d385604cb0175fb7ac45572e Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Tue, 4 Jun 2024 23:34:37 +0530
Subject: [PATCH 046/272] DOC: fix RT03,SA01 for pandas.bdate_range (#58917)

---
 ci/code_checks.sh                | 1 -
 pandas/core/indexes/datetimes.py | 7 +++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 1e615b6df8446..d29761b938eaa 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -343,7 +343,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.arrays.NumpyExtensionArray SA01" \
         -i "pandas.arrays.SparseArray PR07,SA01" \
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
-        -i "pandas.bdate_range RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \
         -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 930bc7a95bd14..c276750314a34 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -1070,6 +1070,13 @@ def bdate_range(
     Returns
     -------
     DatetimeIndex
+        Fixed frequency DatetimeIndex.
+
+    See Also
+    --------
+    date_range : Return a fixed frequency DatetimeIndex.
+    period_range : Return a fixed frequency PeriodIndex.
+    timedelta_range : Return a fixed frequency TimedeltaIndex.
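# Illustrative sketch (editorial, not part of the diff): a quick check of the
# behaviour the new docstring text documents; bdate_range defaults to
# freq="B", so weekends are skipped.
import pandas as pd

idx = pd.bdate_range(start="2024-01-01", periods=5)  # 2024-01-01 is a Monday
assert idx.freqstr == "B"  # 2024-01-01 through 2024-01-05, business days only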
Notes ----- From f55bec1db9620dd3e2136c2b45b05d7391a94015 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 4 Jun 2024 23:35:38 +0530 Subject: [PATCH 047/272] DOC: fix SA01 for pandas.describe_option (#58914) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d29761b938eaa..8145cb62070ca 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -403,7 +403,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.window.rolling.Window.std PR01" \ -i "pandas.core.window.rolling.Window.var PR01" \ -i "pandas.date_range RT03" \ - -i "pandas.describe_option SA01" \ -i "pandas.errors.AbstractMethodError PR01,SA01" \ -i "pandas.errors.AttributeConflictWarning SA01" \ -i "pandas.errors.CSSWarning SA01" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 55d9e29686259..95c549a8ff0e8 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -279,6 +279,12 @@ def describe_option(pat: str = "", _print_desc: bool = True) -> str | None: str If the description(s) as a string if ``_print_desc=False``. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. + Notes ----- For all available options, please view the From da5408750750ad30f270ae1295f1746efa5d9c80 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 4 Jun 2024 23:36:56 +0530 Subject: [PATCH 048/272] DOC: fix PR07 for pandas.pivot_table (#58896) * DOC: fix PR07 for pandas.pivot_table * DOC: remove redundant comments Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/core/reshape/pivot.py | 169 ++++++++++++++++++++++++++++++++++- 2 files changed, 165 insertions(+), 5 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8145cb62070ca..039700f306e03 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -470,7 +470,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.merge_ordered PR07" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.pivot PR07" \ - -i "pandas.pivot_table PR07" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e0126d439a79c..86da19f13bacf 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -54,10 +54,6 @@ from pandas import DataFrame -# Note: We need to make sure `frame` is imported before `pivot`, otherwise -# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot_table"], indents=1) def pivot_table( data: DataFrame, values=None, @@ -71,6 +67,171 @@ def pivot_table( observed: bool = True, sort: bool = True, ) -> DataFrame: + """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects + (hierarchical indexes) on the index and columns of the result DataFrame. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + values : list-like or scalar, optional + Column or columns to aggregate. + index : column, Grouper, array, or list of the previous + Keys to group by on the pivot table index. 
If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + columns : column, Grouper, array, or list of the previous + Keys to group by on the pivot table column. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + aggfunc : function, list of functions, dict, default "mean" + If a list of functions is passed, the resulting pivot table will have + hierarchical columns whose top level are the function names + (inferred from the function objects themselves). + If a dict is passed, the key is column to aggregate and the value is + function or list of functions. If ``margin=True``, aggfunc will be + used to calculate the partial aggregates. + fill_value : scalar, default None + Value to replace missing values with (in the resulting pivot table, + after aggregation). + margins : bool, default False + If ``margins=True``, special ``All`` columns and rows + will be added with partial group aggregates across the categories + on the rows and columns. + dropna : bool, default True + Do not include columns whose entries are all NaN. If True, + rows with a NaN value in any column will be omitted before + computing margins. + margins_name : str, default 'All' + Name of the row / column that will contain the totals + when margins is True. + observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionchanged:: 3.0.0 + + The default value is now ``True``. + + sort : bool, default True + Specifies if the result should be sorted. + + .. versionadded:: 1.3.0 + + Returns + ------- + DataFrame + An Excel style pivot table. + + See Also + -------- + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.melt: Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + ... "C": [ + ... "small", + ... "large", + ... "large", + ... "small", + ... "small", + ... "large", + ... "small", + ... "small", + ... "large", + ... ], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + ... } + ... ) + >>> df + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. + + >>> table = pd.pivot_table( + ... df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum" + ... ) + >>> table + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 + + We can also fill missing values using the `fill_value` parameter. + + >>> table = pd.pivot_table( + ... df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum", fill_value=0 + ... 
) + >>> table + C large small + A B + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pd.pivot_table( + ... df, values=["D", "E"], index=["A", "C"], aggfunc={"D": "mean", "E": "mean"} + ... ) + >>> table + D E + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. + + >>> table = pd.pivot_table( + ... df, + ... values=["D", "E"], + ... index=["A", "C"], + ... aggfunc={"D": "mean", "E": ["min", "max", "mean"]}, + ... ) + >>> table + D E + mean max mean min + A C + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 + """ index = _convert_by(index) columns = _convert_by(columns) From f97c3c4022fcdc884d550420ce7442a52ed00a89 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:08:08 -1000 Subject: [PATCH 049/272] CI: Update CircleCI configs (#58908) * CI: Update CircleCI configs * Fix workflow * add mamba path * Use conda env create * Remove workaround? --- .circleci/config.yml | 72 ++++++++++++------- .circleci/setup_env.sh | 60 ---------------- ...e-310-arm64.yaml => circle-311-arm64.yaml} | 2 +- 3 files changed, 46 insertions(+), 88 deletions(-) delete mode 100755 .circleci/setup_env.sh rename ci/deps/{circle-310-arm64.yaml => circle-311-arm64.yaml} (98%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6f134c9a7a7bd..463667446ed42 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,46 +1,64 @@ version: 2.1 jobs: - test-arm: + test-linux-arm: machine: image: default resource_class: arm.large environment: - ENV_FILE: ci/deps/circle-310-arm64.yaml + ENV_FILE: ci/deps/circle-311-arm64.yaml PYTEST_WORKERS: auto PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" PYTEST_TARGET: "pandas" PANDAS_CI: "1" steps: - checkout - - run: .circleci/setup_env.sh - - run: > - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD - ci/run_tests.sh - linux-musl: + - run: + name: Install Environment and Run Tests + shell: /bin/bash -exuo pipefail + command: | + MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-aarch64.sh" + wget -q $MAMBA_URL -O minimamba.sh + chmod +x minimamba.sh + MAMBA_DIR="$HOME/miniconda3" + rm -rf $MAMBA_DIR + ./minimamba.sh -b -p $MAMBA_DIR + export PATH=$MAMBA_DIR/bin:$PATH + conda info -a + conda env create -q -n pandas-dev -f $ENV_FILE + conda list -n pandas-dev + source activate pandas-dev + if pip show pandas 1>/dev/null; then + pip uninstall -y pandas + fi + python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror" + PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH + ci/run_tests.sh + test-linux-musl: docker: - image: quay.io/pypa/musllinux_1_1_aarch64 resource_class: arm.large steps: # Install pkgs first to have git in the image # (needed for checkout) - - run: | - apk update - apk add git - apk add musl-locales + - run: + name: Install System Packages + command: | + apk update + apk add git + apk add musl-locales - checkout - - run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev - . 
~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 - python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" - python -m pip list --no-cache-dir - - run: | - . ~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + - run: + name: Install Environment and Run Tests + command: | + /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" + python -m pip list --no-cache-dir + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml build-aarch64: parameters: cibw-build: @@ -71,7 +89,7 @@ jobs: name: Build aarch64 wheels no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.15.0 + pip3 install cibuildwheel==2.18.1 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: @@ -81,7 +99,7 @@ jobs: name: Install Anaconda Client & Upload Wheels command: | echo "Install Mambaforge" - MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" + MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-aarch64.sh" echo "Downloading $MAMBA_URL" wget -q $MAMBA_URL -O minimamba.sh chmod +x minimamba.sh @@ -107,14 +125,14 @@ workflows: not: equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] jobs: - - test-arm + - test-linux-arm test-musl: # Don't run trigger this one when scheduled pipeline runs when: not: equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] jobs: - - linux-musl + - test-linux-musl build-wheels: jobs: - build-aarch64: diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh deleted file mode 100755 index eef4db1191a9a..0000000000000 --- a/.circleci/setup_env.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -e - -echo "Install Mambaforge" -MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" -echo "Downloading $MAMBA_URL" -wget -q $MAMBA_URL -O minimamba.sh -chmod +x minimamba.sh - -MAMBA_DIR="$HOME/miniconda3" -rm -rf $MAMBA_DIR -./minimamba.sh -b -p $MAMBA_DIR - -export PATH=$MAMBA_DIR/bin:$PATH - -echo -echo "which conda" -which conda - -echo -echo "update conda" -conda config --set ssl_verify false -conda config --set quiet true --set always_yes true --set changeps1 false -mamba install -y -c conda-forge -n base pip setuptools - -echo "conda info -a" -conda info -a - -echo "conda list (root environment)" -conda list - -echo -# Clean up any left-over from a previous build -mamba env remove -n pandas-dev -echo "mamba env update --file=${ENV_FILE}" -# See https://github.com/mamba-org/mamba/issues/633 -mamba create -q 
-n pandas-dev -time mamba env update -n pandas-dev --file="${ENV_FILE}" - -echo "conda list -n pandas-dev" -conda list -n pandas-dev - -echo "activate pandas-dev" -source activate pandas-dev - -# Explicitly set an environment variable indicating that this is pandas' CI environment. -# -# This allows us to enable things like -Werror that shouldn't be activated in -# downstream CI jobs that may also build pandas from source. -export PANDAS_CI=1 - -if pip show pandas 1>/dev/null; then - echo - echo "remove any installed pandas package w/o removing anything else" - pip uninstall -y pandas -fi - -echo "Install pandas" -python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror" - -echo "done" diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-311-arm64.yaml similarity index 98% rename from ci/deps/circle-310-arm64.yaml rename to ci/deps/circle-311-arm64.yaml index ed4d139714e71..1c31d353699f8 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.10 + - python=3.11 # build dependencies - versioneer[toml] From ea509bbe5c269e925ece5712986077318df85f0e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 08:08:36 -1000 Subject: [PATCH 050/272] CLN: Plotting tests (#58905) * Clean some plotting tests * Cleann test_misc * Format test_datetimelike * clean more datetimelike --- pandas/tests/plotting/test_datetimelike.py | 195 +++++++++------------ pandas/tests/plotting/test_misc.py | 132 +++++--------- pandas/tests/plotting/test_series.py | 94 +++------- pandas/tests/plotting/test_style.py | 18 +- 4 files changed, 154 insertions(+), 285 deletions(-) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 4b4eeada58366..a9135ee583d91 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -46,6 +46,8 @@ mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +import pandas.plotting._matplotlib.converter as conv + class TestTSPlot: @pytest.mark.filterwarnings("ignore::UserWarning") @@ -73,7 +75,7 @@ def test_fontsize_set_correctly(self): def test_frame_inferred(self): # inferred freq - idx = date_range("1/1/1987", freq="MS", periods=100) + idx = date_range("1/1/1987", freq="MS", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame( @@ -82,7 +84,7 @@ def test_frame_inferred(self): _check_plot_works(df.plot) # axes freq - idx = idx[0:40].union(idx[45:99]) + idx = idx[0:4].union(idx[6:]) df2 = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx ) @@ -111,7 +113,6 @@ def test_nonnumeric_exclude(self): fig, ax = mpl.pyplot.subplots() df.plot(ax=ax) # it works assert len(ax.get_lines()) == 1 # B was plotted - mpl.pyplot.close(fig) def test_nonnumeric_exclude_error(self): idx = date_range("1/1/1987", freq="YE", periods=3) @@ -122,7 +123,7 @@ def test_nonnumeric_exclude_error(self): @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -131,7 +132,7 @@ def test_tsplot_period(self, freq): "freq", ["s", "min", "h", "D", "W", 
"ME", "QE-DEC", "YE", "1B30Min"] ) def test_tsplot_datetime(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -145,10 +146,9 @@ def test_tsplot(self): color = (0.0, 0.0, 0.0, 1) assert color == ax.get_lines()[0].get_color() - def test_both_style_and_color(self): - ts = Series( - np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) - ) + @pytest.mark.parametrize("index", [None, date_range("2020-01-01", periods=10)]) + def test_both_style_and_color(self, index): + ts = Series(np.arange(10, dtype=np.float64), index=index) msg = ( "Cannot pass 'style' string with a color symbol and 'color' " "keyword argument. Please use one or the other or pass 'style' " @@ -157,46 +157,37 @@ def test_both_style_and_color(self): with pytest.raises(ValueError, match=msg): ts.plot(style="b-", color="#000099") - s = ts.reset_index(drop=True) - with pytest.raises(ValueError, match=msg): - s.plot(style="b-", color="#000099") - @pytest.mark.parametrize("freq", ["ms", "us"]) def test_high_freq(self, freq): _, ax = mpl.pyplot.subplots() - rng = date_range("1/1/2012", periods=100, freq=freq) + rng = date_range("1/1/2012", periods=10, freq=freq) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): - from pandas.plotting._matplotlib.converter import get_datevalue - - assert get_datevalue(None, "D") is None - assert get_datevalue(1987, "Y") == 1987 - assert get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal - assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - - def test_ts_plot_format_coord(self): - def check_format_of_first_point(ax, expected_string): - first_line = ax.get_lines()[0] - first_x = first_line.get_xdata()[0].ordinal - first_y = first_line.get_ydata()[0] - assert expected_string == ax.format_coord(first_x, first_y) + assert conv.get_datevalue(None, "D") is None + assert conv.get_datevalue(1987, "Y") == 1987 + assert ( + conv.get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal + ) + assert conv.get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - annual = Series(1, index=date_range("2014-01-01", periods=3, freq="YE-DEC")) + @pytest.mark.parametrize( + "freq, expected_string", + [["YE-DEC", "t = 2014 y = 1.000000"], ["D", "t = 2014-01-01 y = 1.000000"]], + ) + def test_ts_plot_format_coord(self, freq, expected_string): + ser = Series(1, index=date_range("2014-01-01", periods=3, freq=freq)) _, ax = mpl.pyplot.subplots() - annual.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014 y = 1.000000") - - # note this is added to the annual plot already in existence, and - # changes its freq field - daily = Series(1, index=date_range("2014-01-01", periods=3, freq="D")) - daily.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") + ser.plot(ax=ax) + first_line = ax.get_lines()[0] + first_x = first_line.get_xdata()[0].ordinal + first_y = first_line.get_ydata()[0] + assert expected_string == ax.format_coord(first_x, first_y) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = 
Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @@ -206,7 +197,7 @@ def test_line_plot_period_series(self, freq): def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the # frequency (`frqncy`) rule code. tests resolution of issue #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) s = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(s.plot, s.index.freq.rule_code) @@ -214,13 +205,13 @@ def test_line_plot_period_mlt_series(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "QE", "YE"]) def test_line_plot_period_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -235,7 +226,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) # of the frequency (`frqncy`) rule code. tests resolution of issue # #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -249,7 +240,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -263,7 +254,7 @@ def test_line_plot_datetime_frame(self, freq): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) ser = Series(ser.values, Index(np.asarray(ser.index))) _check_plot_works(ser.plot, ser.index.inferred_freq) @@ -350,8 +341,8 @@ def test_business_freq(self): def test_business_freq_convert(self): bts = Series( - np.arange(300, dtype=np.float64), - index=date_range("2020-01-01", periods=300, freq="B"), + np.arange(50, dtype=np.float64), + index=date_range("2020-01-01", periods=50, freq="B"), ).asfreq("BME") ts = bts.to_period("M") _, ax = mpl.pyplot.subplots() @@ -444,12 +435,8 @@ def test_axis_limits(self, obj): result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal - fig = ax.get_figure() - mpl.pyplot.close(fig) def test_get_finder(self): - import pandas.plotting._matplotlib.converter as conv - assert conv.get_finder(to_offset("B")) == conv._daily_finder assert conv.get_finder(to_offset("D")) == conv._daily_finder assert conv.get_finder(to_offset("ME")) == conv._monthly_finder @@ -552,7 +539,7 @@ def test_finder_annual(self): @pytest.mark.slow def 
test_finder_minutely(self): - nminutes = 50 * 24 * 60 + nminutes = 1 * 24 * 60 rng = date_range("1/1/1999", freq="Min", periods=nminutes) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -577,9 +564,9 @@ def test_finder_hourly(self): def test_gaps(self): ts = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - ts.iloc[5:25] = np.nan + ts.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() ts.plot(ax=ax) lines = ax.get_lines() @@ -591,8 +578,7 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() - mpl.pyplot.close(ax.get_figure()) + assert mask[5:7, 1].all() def test_gaps_irregular(self): # irregular @@ -613,7 +599,6 @@ def test_gaps_irregular(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[2:5, 1].all() - mpl.pyplot.close(ax.get_figure()) def test_gaps_non_ts(self): # non-ts @@ -634,9 +619,9 @@ def test_gaps_non_ts(self): def test_gap_upsample(self): low = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - low.iloc[5:25] = np.nan + low.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() low.plot(ax=ax) @@ -653,7 +638,7 @@ def test_gap_upsample(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() + assert mask[5:7, 1].all() def test_secondary_y(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -667,7 +652,6 @@ def test_secondary_y(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_yaxis(self): Series(np.random.default_rng(2).standard_normal(10)) @@ -675,7 +659,6 @@ def test_secondary_y_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_both(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -701,7 +684,6 @@ def test_secondary_y_ts(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_ts_yaxis(self): idx = date_range("1/1/2000", periods=10) @@ -709,7 +691,6 @@ def test_secondary_y_ts_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_y_ts_visible(self): idx = date_range("1/1/2000", periods=10) @@ -1108,8 +1089,8 @@ def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # high to low @@ -1122,8 +1103,8 @@ def test_mixed_freq_second_millisecond(self): def test_mixed_freq_second_millisecond_low_to_high(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", 
freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # low to high @@ -1298,7 +1279,6 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_right(self): df = DataFrame( @@ -1315,7 +1295,6 @@ def test_secondary_legend_right(self): assert leg.get_texts()[1].get_text() == "B" assert leg.get_texts()[2].get_text() == "C" assert leg.get_texts()[3].get_text() == "D" - mpl.pyplot.close(fig) def test_secondary_legend_bar(self): df = DataFrame( @@ -1328,7 +1307,6 @@ def test_secondary_legend_bar(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A (right)" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_bar_right(self): df = DataFrame( @@ -1341,7 +1319,6 @@ def test_secondary_legend_bar_right(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_multi_col(self): df = DataFrame( @@ -1366,14 +1343,13 @@ def test_secondary_legend_multi_col(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_nonts(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1387,14 +1363,13 @@ def test_secondary_legend_nonts(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close() def test_secondary_legend_nonts_multi_col(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1448,13 +1423,10 @@ def test_mpl_nopandas(self): exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) - exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp) def test_irregular_ts_shared_ax_xlim(self): # GH 2960 - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1467,8 +1439,8 @@ def test_irregular_ts_shared_ax_xlim(self): # check that axis limits are correct left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_secondary_y_non_ts_xlim(self): # GH 3490 - non-timeseries with secondary y @@ -1504,7 +1476,7 @@ def test_secondary_y_regular_ts_xlim(self): def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed 
frequency timeseries with secondary y - rng = date_range("2000-01-01", periods=10000, freq="min") + rng = date_range("2000-01-01", periods=10, freq="min") ts = Series(1, index=rng) _, ax = mpl.pyplot.subplots() @@ -1519,8 +1491,6 @@ def test_secondary_y_mixed_freq_ts_xlim(self): def test_secondary_y_irregular_ts_xlim(self): # GH 3490 - irregular-timeseries with secondary y - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1534,8 +1504,8 @@ def test_secondary_y_irregular_ts_xlim(self): ts_irregular[:5].plot(ax=ax) left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_plot_outofbounds_datetime(self): # 2579 - checking this does not raise @@ -1722,35 +1692,28 @@ def test_pickle_fig(self, temp_file, frame_or_series, idx): def _check_plot_works(f, freq=None, series=None, *args, **kwargs): - import matplotlib.pyplot as plt - fig = plt.gcf() - try: - plt.clf() - ax = fig.add_subplot(211) - orig_ax = kwargs.pop("ax", plt.gca()) - orig_axfreq = getattr(orig_ax, "freq", None) - - ret = f(*args, **kwargs) - assert ret is not None # do something more intelligent - - ax = kwargs.pop("ax", plt.gca()) - if series is not None: - dfreq = series.index.freq - if isinstance(dfreq, BaseOffset): - dfreq = dfreq.rule_code - if orig_axfreq is None: - assert ax.freq == dfreq - - if freq is not None: - ax_freq = to_offset(ax.freq, is_period=True) - if freq is not None and orig_axfreq is None: - assert ax_freq == freq - - ax = fig.add_subplot(212) - kwargs["ax"] = ax - ret = f(*args, **kwargs) - assert ret is not None # TODO: do something more intelligent - finally: - plt.close(fig) + fig.clf() + ax = fig.add_subplot(211) + orig_ax = kwargs.pop("ax", plt.gca()) + orig_axfreq = getattr(orig_ax, "freq", None) + + ret = f(*args, **kwargs) + assert ret is not None # do something more intelligent + + ax = kwargs.pop("ax", plt.gca()) + if series is not None: + dfreq = series.index.freq + if isinstance(dfreq, BaseOffset): + dfreq = dfreq.rule_code + if orig_axfreq is None: + assert ax.freq == dfreq + + if freq is not None and orig_axfreq is None: + assert to_offset(ax.freq, is_period=True) == freq + + ax = fig.add_subplot(212) + kwargs["ax"] = ax + ret = f(*args, **kwargs) + assert ret is not None # TODO: do something more intelligent diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index d593ddbbaa0b8..43e1255404784 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -31,6 +31,8 @@ plt = pytest.importorskip("matplotlib.pyplot") cm = pytest.importorskip("matplotlib.cm") +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def iris(datapath) -> DataFrame: @@ -109,8 +111,6 @@ def test_savefig(kind, data, index): class TestSeriesPlots: def test_autocorrelation_plot(self): - from pandas.plotting import autocorrelation_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), @@ -118,32 +118,28 @@ def test_autocorrelation_plot(self): ) # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(autocorrelation_plot, series=ser) - 
_check_plot_works(autocorrelation_plot, series=ser.values) + _check_plot_works(plotting.autocorrelation_plot, series=ser) + _check_plot_works(plotting.autocorrelation_plot, series=ser.values) - ax = autocorrelation_plot(ser, label="Test") + ax = plotting.autocorrelation_plot(ser, label="Test") _check_legend_labels(ax, labels=["Test"]) @pytest.mark.parametrize("kwargs", [{}, {"lag": 5}]) def test_lag_plot(self, kwargs): - from pandas.plotting import lag_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(lag_plot, series=ser, **kwargs) + _check_plot_works(plotting.lag_plot, series=ser, **kwargs) def test_bootstrap_plot(self): - from pandas.plotting import bootstrap_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(bootstrap_plot, series=ser, size=10) + _check_plot_works(plotting.bootstrap_plot, series=ser, size=10) class TestDataFramePlots: @@ -156,7 +152,7 @@ def test_scatter_matrix_axis(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(2).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 3))) # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning, check_stacklevel=False): @@ -168,7 +164,7 @@ def test_scatter_matrix_axis(self, pass_axis): ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() # GH 5662 - expected = ["-2", "0", "2"] + expected = ["-2", "-1", "0"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @@ -181,7 +177,7 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(11).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(11).standard_normal((10, 3))) df[0] = (df[0] - 2) / 3 # we are plotting multiples on a sub-plot @@ -193,18 +189,15 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): ax=ax, ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ["-1.0", "-0.5", "0.0"] + expected = ["-1.25", "-1.0", "-0.75", "-0.5"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow def test_andrews_curves_no_warning(self, iris): - from pandas.plotting import andrews_curves - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(andrews_curves, frame=df, class_column="Name") + _check_plot_works(plotting.andrews_curves, frame=iris, class_column="Name") @pytest.mark.slow @pytest.mark.parametrize( @@ -229,12 +222,10 @@ def test_andrews_curves_no_warning(self, iris): ], ) def test_andrews_curves_linecolors(self, request, df, linecolors): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=linecolors + plotting.andrews_curves, frame=df, class_column="Name", color=linecolors ) _check_colors( ax.get_lines()[:10], linecolors=linecolors, mapping=df["Name"][:10] @@ -256,23 +247,19 @@ def test_andrews_curves_linecolors(self, request, df, linecolors): ], ) def test_andrews_curves_cmap(self, request, df): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] 
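# Illustrative sketch (editorial, not part of the upstream diff): the
# cmap-sampling idiom used just above assigns each class label an evenly
# spaced position along the colormap.
import numpy as np
from matplotlib import cm

labels = ["setosa", "versicolor", "virginica"]
colors = [cm.jet(x) for x in np.linspace(0, 1, len(labels))]  # one RGBA tuple per label
assert len(colors) == len(labels)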
ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=cmaps + plotting.andrews_curves, frame=df, class_column="Name", color=cmaps ) _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_andrews_curves_handle(self): - from pandas.plotting import andrews_curves - colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = andrews_curves(df, "Name", color=colors) + ax = plotting.andrews_curves(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) @@ -282,61 +269,54 @@ def test_andrews_curves_handle(self): [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_parallel_coordinates_colors(self, iris, color): - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", color=color + plotting.parallel_coordinates, frame=df, class_column="Name", color=color ) _check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet + plotting.parallel_coordinates, + frame=df, + class_column="Name", + colormap=cm.jet, ) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_line_diff(self, iris): - from pandas.plotting import parallel_coordinates - df = iris - ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") + ax = _check_plot_works( + plotting.parallel_coordinates, frame=df, class_column="Name" + ) nlines = len(ax.get_lines()) nxticks = len(ax.xaxis.get_ticklabels()) ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", axvlines=False + plotting.parallel_coordinates, frame=df, class_column="Name", axvlines=False ) assert len(ax.get_lines()) == (nlines - nxticks) @pytest.mark.slow def test_parallel_coordinates_handles(self, iris): - from pandas.plotting import parallel_coordinates - df = iris colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = parallel_coordinates(df, "Name", color=colors) + ax = plotting.parallel_coordinates(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") def test_parallel_coordinates_with_sorted_labels(self): - """For #15908""" - from pandas.plotting import parallel_coordinates - + # GH 15908 df = DataFrame( { "feat": list(range(30)), @@ -345,7 +325,7 @@ def test_parallel_coordinates_with_sorted_labels(self): + [1 for _ in range(10)], } ) - ax = parallel_coordinates(df, "class", sort_labels=True) + ax = plotting.parallel_coordinates(df, "class", sort_labels=True) polylines, labels = ax.get_legend_handles_labels() color_label_tuples = zip( [polyline.get_color() for polyline in polylines], labels @@ -359,45 +339,38 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] def 
test_radviz_no_warning(self, iris): - from pandas.plotting import radviz - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(radviz, frame=df, class_column="Name") + _check_plot_works(plotting.radviz, frame=iris, class_column="Name") @pytest.mark.parametrize( "color", [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_radviz_color(self, iris, color): - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", color=color) + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", color=color + ) # skip Circle drawn as ticks patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches[:10], facecolors=color, mapping=df["Name"][:10]) def test_radviz_color_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10]) def test_radviz_colors_handles(self): - from pandas.plotting import radviz - colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]] df = DataFrame( {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]} ) - ax = radviz(df, "Name", color=colors) + ax = plotting.radviz(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, facecolors=colors) @@ -471,15 +444,11 @@ def test_get_standard_colors_random_seed(self): def test_get_standard_colors_consistency(self): # GH17525 # Make sure it produces the same colors every time it's called - from pandas.plotting._matplotlib.style import get_standard_colors - color1 = get_standard_colors(1, color_type="random") color2 = get_standard_colors(1, color_type="random") assert color1 == color2 def test_get_standard_colors_default_num_colors(self): - from pandas.plotting._matplotlib.style import get_standard_colors - # Make sure the default color_types returns the specified amount color1 = get_standard_colors(1, color_type="default") color2 = get_standard_colors(9, color_type="default") @@ -509,11 +478,7 @@ def test_get_standard_colors_no_appending(self): # Make sure not to add more colors so that matplotlib can cycle # correctly. 
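The comment above states the contract this test pins down: when an explicit color list is passed, ``get_standard_colors`` must return it unchanged rather than padding it out to ``num_colors``, so matplotlib can cycle through it correctly. A minimal standalone sketch of that contract, assuming the internal import path these tests already use:

    from pandas.plotting._matplotlib.style import get_standard_colors

    palette = ["r", "g", "b", "k", "m"]
    colors = get_standard_colors(1, color=palette)
    assert len(colors) == len(palette)  # the explicit list is not extended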
- from matplotlib import cm - - from pandas.plotting._matplotlib.style import get_standard_colors - - color_before = cm.gnuplot(range(5)) + color_before = mpl.cm.gnuplot(range(5)) color_after = get_standard_colors(1, color=color_before) assert len(color_after) == len(color_before) @@ -521,7 +486,7 @@ def test_get_standard_colors_no_appending(self): np.random.default_rng(2).standard_normal((48, 4)), columns=list("ABCD") ) - color_list = cm.gnuplot(np.linspace(0, 1, 16)) + color_list = mpl.cm.gnuplot(np.linspace(0, 1, 16)) p = df.A.plot.bar(figsize=(16, 7), color=color_list) assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() @@ -546,9 +511,7 @@ def test_dictionary_color(self, kind): def test_bar_plot(self): # GH38947 # Test bar plot with string and int index - from matplotlib.text import Text - - expected = [Text(0, 0, "0"), Text(1, 0, "Total")] + expected = [mpl.text.Text(0, 0, "0"), mpl.text.Text(1, 0, "Total")] df = DataFrame( { @@ -565,11 +528,12 @@ def test_bar_plot(self): def test_barh_plot_labels_mixed_integer_string(self): # GH39126 # Test barh plot with string and integer at the same column - from matplotlib.text import Text - df = DataFrame([{"word": 1, "value": 0}, {"word": "knowledge", "value": 2}]) plot_barh = df.plot.barh(x="word", legend=None) - expected_yticklabels = [Text(0, 0, "1"), Text(0, 1, "knowledge")] + expected_yticklabels = [ + mpl.text.Text(0, 0, "1"), + mpl.text.Text(0, 1, "knowledge"), + ] assert all( actual.get_text() == expected.get_text() for actual, expected in zip( @@ -649,8 +613,8 @@ def test_externally_shared_axes(self): # Create data df = DataFrame( { - "a": np.random.default_rng(2).standard_normal(1000), - "b": np.random.default_rng(2).standard_normal(1000), + "a": np.random.default_rng(2).standard_normal(10), + "b": np.random.default_rng(2).standard_normal(10), } ) @@ -707,9 +671,7 @@ def test_plot_bar_axis_units_timestamp_conversion(self): def test_bar_plt_xaxis_intervalrange(self): # GH 38969 # Ensure IntervalIndex x-axis produces a bar plot as expected - from matplotlib.text import Text - - expected = [Text(0, 0, "([0, 1],)"), Text(1, 0, "([1, 2],)")] + expected = [mpl.text.Text(0, 0, "([0, 1],)"), mpl.text.Text(1, 0, "([1, 2],)")] s = Series( [1, 2], index=[interval_range(0, 2, closed="both")], diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 54f09c7007330..279d9a18d8df7 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -33,9 +33,14 @@ get_y_axis, ) +from pandas.tseries.offsets import CustomBusinessDay + mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +from pandas.plotting._matplotlib.converter import DatetimeConverter +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def ts(): @@ -49,7 +54,7 @@ def ts(): @pytest.fixture def series(): return Series( - range(20), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(20)] + range(10), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(10)] ) @@ -192,28 +197,24 @@ def test_area_sharey_dont_overwrite(self, ts): assert get_y_axis(ax1).joined(ax1, ax2) assert get_y_axis(ax2).joined(ax1, ax2) - plt.close(fig) def test_label(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(label="LABEL", legend=True, ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_none(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) 
_check_legend_labels(ax, labels=[""]) - mpl.pyplot.close("all") def test_label_ser_name(self): s = Series([1, 2], name="NAME") _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) _check_legend_labels(ax, labels=["NAME"]) - mpl.pyplot.close("all") def test_label_ser_name_override(self): s = Series([1, 2], name="NAME") @@ -221,7 +222,6 @@ def test_label_ser_name_override(self): _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, label="LABEL", ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_ser_name_override_dont_draw(self): s = Series([1, 2], name="NAME") @@ -231,7 +231,6 @@ def test_label_ser_name_override_dont_draw(self): assert ax.get_legend() is None # Hasn't been drawn ax.legend() # draw it _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_boolean(self): # GH 23719 @@ -344,9 +343,7 @@ def test_rotation_30(self): _check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): - from pandas.plotting._matplotlib.converter import DatetimeConverter - - rng = date_range("1/1/2000", "3/1/2000") + rng = date_range("1/1/2000", "1/15/2000") rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -453,9 +450,9 @@ def test_pie_nan(self): def test_df_series_secondary_legend(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # primary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() @@ -467,28 +464,12 @@ def test_df_series_secondary_legend(self): assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_with_axes(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # primary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are drawn on left ax - # left and right axis must be visible - _check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) - assert ax.get_yaxis().get_visible() - assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, ax=ax) @@ -500,29 +481,12 @@ def test_df_series_secondary_legend_both(self): assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # secondary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(secondary_y=True, ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are 
drawn on left ax - # left axis must be invisible and right axis must be visible - expected = ["a (right)", "b (right)", "c (right)", "x (right)"] - _check_legend_labels(ax.left_ax, expected) - assert not ax.left_ax.get_yaxis().get_visible() - assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis_2(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (with passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, mark_right=False, ax=ax) @@ -537,17 +501,12 @@ def test_df_series_secondary_legend_both_with_axis_2(self): @pytest.mark.parametrize( "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] ) - def test_secondary_logy(self, input_logy, expected_scale): - # GH 25545 - s1 = Series(np.random.default_rng(2).standard_normal(100)) - s2 = Series(np.random.default_rng(2).standard_normal(100)) - - # GH 24980 - ax1 = s1.plot(logy=input_logy) - ax2 = s2.plot(secondary_y=True, logy=input_logy) - + @pytest.mark.parametrize("secondary_kwarg", [{}, {"secondary_y": True}]) + def test_secondary_logy(self, input_logy, expected_scale, secondary_kwarg): + # GH 25545, GH 24980 + s1 = Series(np.random.default_rng(2).standard_normal(10)) + ax1 = s1.plot(logy=input_logy, **secondary_kwarg) assert ax1.get_yscale() == expected_scale - assert ax2.get_yscale() == expected_scale def test_plot_fails_with_dupe_color_and_style(self): x = Series(np.random.default_rng(2).standard_normal(2)) @@ -673,6 +632,9 @@ def test_errorbar_asymmetrical(self): expected = (err.T * np.array([-1, 1])) + s.to_numpy().reshape(-1, 1) tm.assert_numpy_array_equal(result, expected) + def test_errorbar_asymmetrical_error(self): + # GH9536 + s = Series(np.arange(10), name="x") msg = ( "Asymmetrical error bars should be provided " f"with the shape \\(2, {len(s)}\\)" @@ -759,8 +721,6 @@ def test_series_grid_settings(self): @pytest.mark.parametrize("c", ["r", "red", "green", "#FF0000"]) def test_standard_colors(self, c): - from pandas.plotting._matplotlib.style import get_standard_colors - result = get_standard_colors(1, color=c) assert result == [c] @@ -774,12 +734,8 @@ def test_standard_colors(self, c): assert result == [c] * 3 def test_standard_colors_all(self): - from matplotlib import colors - - from pandas.plotting._matplotlib.style import get_standard_colors - # multiple colors like mediumaquamarine - for c in colors.cnames: + for c in mpl.colors.cnames: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -793,7 +749,7 @@ def test_standard_colors_all(self): assert result == [c] * 3 # single letter colors like k - for c in colors.ColorConverter.colors: + for c in mpl.colors.ColorConverter.colors: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -821,8 +777,6 @@ def test_time_series_plot_color_kwargs(self): _check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_with_empty_kwargs(self): - import matplotlib as mpl - def_colors = _unpack_cycler(mpl.rcParams) index = date_range("1/1/2000", periods=12) s = Series(np.arange(1, 13), index=index) @@ -851,8 +805,6 @@ def test_xtick_barPlot(self): def test_custom_business_day_freq(self): # GH7222 - from pandas.tseries.offsets import CustomBusinessDay - s = Series( range(100, 
 121),
            index=pd.bdate_range(
diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py
index 665bda15724fd..f9c89e0a7893f 100644
--- a/pandas/tests/plotting/test_style.py
+++ b/pandas/tests/plotting/test_style.py
@@ -2,7 +2,8 @@

 from pandas import Series

-pytest.importorskip("matplotlib")
+mpl = pytest.importorskip("matplotlib")
+plt = pytest.importorskip("matplotlib.pyplot")

 from pandas.plotting._matplotlib.style import get_standard_colors

@@ -18,11 +19,8 @@ class TestGetStandardColors:
         ],
     )
     def test_default_colors_named_from_prop_cycle(self, num_colors, expected):
-        import matplotlib as mpl
-        from matplotlib.pyplot import cycler
-
         mpl_params = {
-            "axes.prop_cycle": cycler(color=["red", "green", "blue"]),
+            "axes.prop_cycle": plt.cycler(color=["red", "green", "blue"]),
         }
         with mpl.rc_context(rc=mpl_params):
             result = get_standard_colors(num_colors=num_colors)
@@ -39,11 +37,8 @@ def test_default_colors_named_from_prop_cycle(self, num_colors, expected):
         ],
     )
     def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected):
-        import matplotlib as mpl
-        from matplotlib.pyplot import cycler
-
         mpl_params = {
-            "axes.prop_cycle": cycler(color="bgry"),
+            "axes.prop_cycle": plt.cycler(color="bgry"),
         }
         with mpl.rc_context(rc=mpl_params):
             result = get_standard_colors(num_colors=num_colors)
@@ -74,11 +69,8 @@ def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected)
         ],
     )
     def test_default_colors_named_undefined_prop_cycle(self, num_colors, expected_name):
-        import matplotlib as mpl
-        import matplotlib.colors as mcolors
-
         with mpl.rc_context(rc={}):
-            expected = [mcolors.to_hex(x) for x in expected_name]
+            expected = [mpl.colors.to_hex(x) for x in expected_name]
             result = get_standard_colors(num_colors=num_colors)
             assert result == expected

From 728cfcbc71f2bc9e3d35c4d5269cd8e66183c46f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 4 Jun 2024 08:09:09 -1000
Subject: [PATCH 051/272] CLN: Plotting tests 2 (#58910)

* Refactor test_converter

* clean test_boxplot method

* clean test_hist_method
---
 pandas/tests/plotting/test_boxplot_method.py | 29 +++-----
 pandas/tests/plotting/test_converter.py      | 28 ++------
 pandas/tests/plotting/test_hist_method.py    | 71 ++++++++------------
 3 files changed, 44 insertions(+), 84 deletions(-)

diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index 573f95eed15ef..4916963ab7c87 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -38,9 +38,7 @@ def _check_ax_limits(col, ax):
 class TestDataFramePlots:
     def test_stacked_boxplot_set_axis(self):
         # GH2980
-        import matplotlib.pyplot as plt
-
-        n = 80
+        n = 30
         df = DataFrame(
             {
                 "Clinical": np.random.default_rng(2).choice([0, 1, 2, 3], n),
                 "Confirmed": np.random.default_rng(2).choice([0, 1, 2, 3], n),
                 "Discarded": np.random.default_rng(2).choice([0, 1, 2, 3], n),
             },
         )
         ax = df.plot(kind="bar", stacked=True)
         assert [int(x.get_text()) for x in ax.get_xticklabels()] == df.index.to_list()
-        ax.set_xticks(np.arange(0, 80, 10))
+        ax.set_xticks(np.arange(0, n, 10))
         plt.draw()  # Update changes
         assert [int(x.get_text()) for x in ax.get_xticklabels()] == list(
-            np.arange(0, 80, 10)
+            np.arange(0, n, 10)
         )

     @pytest.mark.slow
@@ -227,12 +225,12 @@ def test_boxplot_numeric_data(self):
         # GH 22799
         df = DataFrame(
             {
-                "a": date_range("2012-01-01", periods=100),
-                "b": np.random.default_rng(2).standard_normal(100),
-                "c": np.random.default_rng(2).standard_normal(100) + 2,
-                "d":
date_range("2012-01-01", periods=100).astype(str), - "e": date_range("2012-01-01", periods=100, tz="UTC"), - "f": timedelta_range("1 days", periods=100), + "a": date_range("2012-01-01", periods=10), + "b": np.random.default_rng(2).standard_normal(10), + "c": np.random.default_rng(2).standard_normal(10) + 2, + "d": date_range("2012-01-01", periods=10).astype(str), + "e": date_range("2012-01-01", periods=10, tz="UTC"), + "f": timedelta_range("1 days", periods=10), } ) ax = df.plot(kind="box") @@ -282,8 +280,6 @@ def test_color_kwd(self, colors_kwd, expected): def test_colors_in_theme(self, scheme, expected): # GH: 40769 df = DataFrame(np.random.default_rng(2).random((10, 2))) - import matplotlib.pyplot as plt - plt.style.use(scheme) result = df.plot.box(return_type="dict") for k, v in expected.items(): @@ -334,8 +330,8 @@ def test_plot_xlabel_ylabel(self, vert): def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) - df1 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) - df2 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + df1 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD")) + df2 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD")) xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) @@ -344,7 +340,6 @@ def test_plot_box(self, vert): for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - mpl.pyplot.close() @pytest.mark.parametrize("vert", [True, False]) def test_boxplot_xlabel_ylabel(self, vert): @@ -374,7 +369,6 @@ def test_boxplot_group_xlabel_ylabel(self, vert): for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel - mpl.pyplot.close() @pytest.mark.parametrize("vert", [True, False]) def test_boxplot_group_no_xlabel_ylabel(self, vert): @@ -389,7 +383,6 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): for subplot in ax: target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() assert target_label == pprint_thing(["group"]) - mpl.pyplot.close() class TestDataFrameGroupByPlots: diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index d4774a5cd0439..6a1777b098de0 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -34,15 +34,11 @@ Second, ) -try: - from pandas.plotting._matplotlib import converter -except ImportError: - # try / except, rather than skip, to avoid internal refactoring - # causing an improper skip - pass - -pytest.importorskip("matplotlib.pyplot") +plt = pytest.importorskip("matplotlib.pyplot") dates = pytest.importorskip("matplotlib.dates") +units = pytest.importorskip("matplotlib.units") + +from pandas.plotting._matplotlib import converter @pytest.mark.single_cpu @@ -79,30 +75,22 @@ def test_dont_register_by_default(self): assert subprocess.check_call(call) == 0 def test_registering_no_warning(self): - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warn" state, in case this isn't the first test run register_matplotlib_converters() ax.plot(s.index, s.values) - plt.close() def test_pandas_plots_register(self): - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) # Set to the "warn" state, in case this isn't the first test run with tm.assert_produces_warning(None) as w: s.plot() - try: - assert len(w) == 0 - finally: 
- plt.close() + assert len(w) == 0 def test_matplotlib_formatters(self): - units = pytest.importorskip("matplotlib.units") - # Can't make any assertion about the start state. # We we check that toggling converters off removes it, and toggling it # on restores it. @@ -113,8 +101,6 @@ def test_matplotlib_formatters(self): assert Timestamp in units.registry def test_option_no_warning(self): - pytest.importorskip("matplotlib.pyplot") - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() @@ -126,12 +112,8 @@ def test_option_no_warning(self): register_matplotlib_converters() with cf.option_context("plotting.matplotlib.register_converters", False): ax.plot(s.index, s.values) - plt.close() def test_registry_resets(self): - units = pytest.importorskip("matplotlib.units") - dates = pytest.importorskip("matplotlib.dates") - # make a copy, to reset to original = dict(units.registry) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 511c1dd7761d5..65cb62917dc4e 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -27,6 +27,9 @@ ) mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") + +from pandas.plotting._matplotlib.hist import _grouped_hist @pytest.fixture @@ -119,18 +122,13 @@ def test_hist_layout_with_by_shape(self, hist_df): _check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) def test_hist_no_overlap(self): - from matplotlib.pyplot import ( - gcf, - subplot, - ) - x = Series(np.random.default_rng(2).standard_normal(2)) y = Series(np.random.default_rng(2).standard_normal(2)) - subplot(121) + plt.subplot(121) x.hist() - subplot(122) + plt.subplot(122) y.hist() - fig = gcf() + fig = plt.gcf() axes = fig.axes assert len(axes) == 2 @@ -140,10 +138,8 @@ def test_hist_by_no_extra_plots(self, hist_df): assert len(mpl.pyplot.get_fignums()) == 1 def test_plot_fails_when_ax_differs_from_figure(self, ts): - from pylab import figure - - fig1 = figure() - fig2 = figure() + fig1 = plt.figure(1) + fig2 = plt.figure(2) ax1 = fig1.add_subplot(111) msg = "passed axis not bound to passed figure" with pytest.raises(AssertionError, match=msg): @@ -169,8 +165,8 @@ def test_histtype_argument(self, histtype, expected): ) def test_hist_with_legend(self, by, expected_axes_num, expected_layout): # GH 6279 - Series histogram can have a legend - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" # Use default_axes=True when plotting method generate subplots itself @@ -181,8 +177,8 @@ def test_hist_with_legend(self, by, expected_axes_num, expected_layout): @pytest.mark.parametrize("by", [None, "b"]) def test_hist_with_legend_raises(self, by): # GH 6279 - Series histogram with legend and label raises - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" with pytest.raises(ValueError, match="Cannot use both legend and label"): @@ -331,12 +327,10 @@ def test_hist_df_legacy_layout_labelsize_rot(self, frame_or_series): @pytest.mark.slow def test_hist_df_legacy_rectangles(self): - from matplotlib.patches import 
Rectangle - ser = Series(range(10)) ax = ser.hist(cumulative=True, bins=4, density=True) # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-1].get_height(), 1.0) @pytest.mark.slow @@ -431,12 +425,12 @@ def test_hist_layout_error(self): # GH 9351 def test_tight_layout(self): - df = DataFrame(np.random.default_rng(2).standard_normal((100, 2))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) df[2] = to_datetime( np.random.default_rng(2).integers( 812419200000000000, 819331200000000000, - size=100, + size=10, dtype=np.int64, ) ) @@ -504,7 +498,7 @@ def test_hist_column_order_unchanged(self, column, expected): def test_histtype_argument(self, histtype, expected): # GH23992 Verify functioning of histtype argument df = DataFrame( - np.random.default_rng(2).integers(1, 10, size=(100, 2)), columns=["a", "b"] + np.random.default_rng(2).integers(1, 10, size=(10, 2)), columns=["a", "b"] ) ax = df.hist(histtype=histtype) _check_patches_all_filled(ax, filled=expected) @@ -519,9 +513,9 @@ def test_hist_with_legend(self, by, column): if by is not None: expected_labels = [expected_labels] * 2 - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -545,9 +539,9 @@ def test_hist_with_legend(self, by, column): @pytest.mark.parametrize("column", [None, "b"]) def test_hist_with_legend_raises(self, by, column): # GH 6279 - DataFrame histogram with legend and label raises - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -586,7 +580,7 @@ def test_hist_df_with_nonnumerics_no_bins(self): def test_hist_secondary_legend(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # primary -> secondary @@ -602,7 +596,7 @@ def test_hist_secondary_legend(self): def test_hist_secondary_secondary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> secondary _, ax = mpl.pyplot.subplots() @@ -617,7 +611,7 @@ def test_hist_secondary_secondary(self): def test_hist_secondary_primary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> primary _, ax = mpl.pyplot.subplots() @@ -632,7 +626,6 @@ def test_hist_secondary_primary(self): def test_hist_with_nans_and_weights(self): # GH 48884 - mpl_patches = pytest.importorskip("matplotlib.patches") df = DataFrame( [[np.nan, 0.2, 0.3], [0.4, np.nan, np.nan], [0.7, 0.8, 0.9]], columns=list("abc"), @@ -643,12 +636,12 @@ def test_hist_with_nans_and_weights(self): _, ax0 = mpl.pyplot.subplots() df.plot.hist(ax=ax0, weights=weights) - rects = [x for x in ax0.get_children() if isinstance(x, mpl_patches.Rectangle)] + rects = [x for x in ax0.get_children() if isinstance(x, 
mpl.patches.Rectangle)] heights = [rect.get_height() for rect in rects] _, ax1 = mpl.pyplot.subplots() no_nan_df.plot.hist(ax=ax1, weights=no_nan_weights) no_nan_rects = [ - x for x in ax1.get_children() if isinstance(x, mpl_patches.Rectangle) + x for x in ax1.get_children() if isinstance(x, mpl.patches.Rectangle) ] no_nan_heights = [rect.get_height() for rect in no_nan_rects] assert all(h0 == h1 for h0, h1 in zip(heights, no_nan_heights)) @@ -663,8 +656,6 @@ def test_hist_with_nans_and_weights(self): class TestDataFrameGroupByPlots: def test_grouped_hist_legacy(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(10) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -716,10 +707,6 @@ def test_grouped_hist_legacy_single_key(self): _check_ticks_props(axes, xrot=30) def test_grouped_hist_legacy_grouped_hist_kwargs(self): - from matplotlib.patches import Rectangle - - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -748,14 +735,14 @@ def test_grouped_hist_legacy_grouped_hist_kwargs(self): ) # height of last bin (index 5) must be 1.0 for ax in axes.ravel(): - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [ + x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle) + ] height = rects[-1].get_height() tm.assert_almost_equal(height, 1.0) _check_ticks_props(axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) def test_grouped_hist_legacy_grouped_hist(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -773,8 +760,6 @@ def test_grouped_hist_legacy_grouped_hist(self): _check_ax_scales(axes, yaxis="log") def test_grouped_hist_legacy_external_err(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( From 4526ea70f443af489959ba9d1233a51150109b10 Mon Sep 17 00:00:00 2001 From: undermyumbrella1 <120079323+undermyumbrella1@users.noreply.github.com> Date: Wed, 5 Jun 2024 02:20:13 +0800 Subject: [PATCH 052/272] Update compute_dict_like to get all columns (#58452) * Update compute_dict_like to get all columns * Add tests * Update rst * Remove newline from rst * Project the columns before converting to series group by * retrigger doc build * Account for 1d/series projection result * Declare var before assignment * Remove if condition * Add test to test agg list funcs --------- Co-authored-by: Kei Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/apply.py | 35 +++++- .../tests/groupby/aggregate/test_aggregate.py | 118 ++++++++++++++++++ 3 files changed, 149 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2d4fafc3c4e1e..7f5c879c0d9f5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -535,6 +535,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result 
in wrong interpolation (:issue:`21351`)
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when given dictionary input with duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns (:issue:`55041`)
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
 - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
 - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 25836e967e948..33f506235870d 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -471,8 +471,30 @@ def compute_dict_like(
             keys += [key] * len(key_data)
             results += key_data

-        else:
+        elif is_groupby:
             # key used for column selection and output
+
+            df = selected_obj
+            results, keys = [], []
+            for key, how in func.items():
+                cols = df[key]
+
+                if cols.ndim == 1:
+                    series_list = [obj._gotitem(key, ndim=1, subset=cols)]
+                else:
+                    series_list = []
+                    for index in range(cols.shape[1]):
+                        col = cols.iloc[:, index]
+
+                        series = obj._gotitem(key, ndim=1, subset=col)
+                        series_list.append(series)
+
+                for series in series_list:
+                    result = getattr(series, op_name)(how, **kwargs)
+                    results.append(result)
+                    keys.append(key)
+
+        else:
             results = [
                 getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
                 for key, how in func.items()
@@ -496,11 +518,14 @@ def wrap_results_dict_like(
         is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data]

         if all(is_ndframe):
-            results = dict(zip(result_index, result_data))
+            results = [result for result in result_data if not result.empty]
             keys_to_use: Iterable[Hashable]
-            keys_to_use = [k for k in result_index if not results[k].empty]
+            keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty]
             # Have to check, if at least one DataFrame is not empty.
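The whatsnew entry added earlier in this patch describes the behavior change; a minimal sketch of it, adapted from the tests this patch adds below (data values illustrative):

    import pandas as pd

    df = pd.DataFrame(
        [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]],
        columns=["a", "b", "c", "c"],  # duplicated "c" column
    )
    # Previously this raised AttributeError; with the fix it aggregates every
    # "c" column, returning a result that keeps both "c" columns.
    result = df.groupby("a").agg({"c": "sum"})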
- keys_to_use = keys_to_use if keys_to_use != [] else result_index + if keys_to_use == []: + keys_to_use = result_index + results = result_data + if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) @@ -509,7 +534,7 @@ def wrap_results_dict_like( axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 result = concat( - {k: results[k] for k in keys_to_use}, + results, axis=axis, keys=keys_to_use, ) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3362d6209af6d..1f140063fd84b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1662,3 +1662,121 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +def test_groupby_aggregation_duplicate_columns_single_dict_value(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": "sum"}) + + expected = DataFrame( + [[7, 9], [5, 6]], columns=["c", "c"], index=Index([1, 2], name="a") + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_multiple_dict_values(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": ["sum", "min", "max", "min"]}) + + expected = DataFrame( + [[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]], + columns=MultiIndex( + levels=[["c"], ["sum", "min", "max"]], + codes=[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 1, 0, 1, 2, 1]], + ), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_some_empty_result(): + # GH#55041 + df = DataFrame( + [ + [1, 9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, -546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=["a", "b", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": [], "c": ["var"]}) + + expected = DataFrame( + [[1.509268e11, 30944844.5], [2.178000e03, 0.0]], + columns=MultiIndex(levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): "min"}) + + expected = DataFrame( + [[-9843, 9], [244, -33]], + columns=MultiIndex(levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], 
["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): ["min", "max"]}) + + expected = DataFrame( + [[-9843, 940, 9, 546], [244, 244, -33, -33]], + columns=MultiIndex( + levels=[["level1.1"], ["level2.2"], ["min", "max"]], + codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]], + ), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) From cc85e2c46a23be9fe57595e021024783590c682d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 09:33:06 -1000 Subject: [PATCH 053/272] CLN: Matplotlib imports in style (#58921) Use import_optional_dependency in style.py --- pandas/io/formats/style.py | 150 +++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 83 deletions(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 69021eb2656f6..8212b50594842 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -4,7 +4,6 @@ from __future__ import annotations -from contextlib import contextmanager import copy from functools import partial import operator @@ -56,7 +55,6 @@ if TYPE_CHECKING: from collections.abc import ( - Generator, Hashable, Sequence, ) @@ -84,22 +82,6 @@ from pandas import ExcelWriter -try: - import matplotlib as mpl - import matplotlib.pyplot as plt - - has_mpl = True -except ImportError: - has_mpl = False - - -@contextmanager -def _mpl(func: Callable) -> Generator[tuple[Any, Any], None, None]: - if has_mpl: - yield plt, mpl - else: - raise ImportError(f"{func.__name__} requires matplotlib.") - #### # Shared Doc Strings @@ -3832,61 +3814,61 @@ def _background_gradient( else: # else validate gmap against the underlying data gmap = _validate_apply_axis_arg(gmap, "gmap", float, data) - with _mpl(Styler.background_gradient) as (_, _matplotlib): - smin = np.nanmin(gmap) if vmin is None else vmin - smax = np.nanmax(gmap) if vmax is None else vmax - rng = smax - smin - # extend lower / upper bounds, compresses color range - norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + smin = np.nanmin(gmap) if vmin is None else vmin + smax = np.nanmax(gmap) if vmax is None else vmax + rng = smax - smin + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.background_gradient requires matplotlib." + ) + # extend lower / upper bounds, compresses color range + norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + + if cmap is None: + rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]](norm(gmap)) + else: + rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) + + def relative_luminance(rgba) -> float: + """ + Calculate relative luminance of a color. 
+ + The calculation adheres to the W3C standards + (https://www.w3.org/WAI/GL/wiki/Relative_luminance) + + Parameters + ---------- + color : rgb or rgba tuple + + Returns + ------- + float + The relative luminance as a value from 0 to 1 + """ + r, g, b = ( + x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 + for x in rgba[:3] + ) + return 0.2126 * r + 0.7152 * g + 0.0722 * b - if cmap is None: - rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]]( - norm(gmap) + def css(rgba, text_only) -> str: + if not text_only: + dark = relative_luminance(rgba) < text_color_threshold + text_color = "#f1f1f1" if dark else "#000000" + return ( + f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" + f"color: {text_color};" ) else: - rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) - - def relative_luminance(rgba) -> float: - """ - Calculate relative luminance of a color. - - The calculation adheres to the W3C standards - (https://www.w3.org/WAI/GL/wiki/Relative_luminance) - - Parameters - ---------- - color : rgb or rgba tuple - - Returns - ------- - float - The relative luminance as a value from 0 to 1 - """ - r, g, b = ( - x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 - for x in rgba[:3] - ) - return 0.2126 * r + 0.7152 * g + 0.0722 * b - - def css(rgba, text_only) -> str: - if not text_only: - dark = relative_luminance(rgba) < text_color_threshold - text_color = "#f1f1f1" if dark else "#000000" - return ( - f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" - f"color: {text_color};" - ) - else: - return f"color: {_matplotlib.colors.rgb2hex(rgba)};" + return f"color: {_matplotlib.colors.rgb2hex(rgba)};" - if data.ndim == 1: - return [css(rgba, text_only) for rgba in rgbas] - else: - return DataFrame( - [[css(rgba, text_only) for rgba in row] for row in rgbas], - index=data.index, - columns=data.columns, - ) + if data.ndim == 1: + return [css(rgba, text_only) for rgba in rgbas] + else: + return DataFrame( + [[css(rgba, text_only) for rgba in row] for row in rgbas], + index=data.index, + columns=data.columns, + ) def _highlight_between( @@ -4124,20 +4106,22 @@ def css_calc(x, left: float, right: float, align: str, color: str | list | tuple rgbas = None if cmap is not None: # use the matplotlib colormap input - with _mpl(Styler.bar) as (_, _matplotlib): - cmap = ( - _matplotlib.colormaps[cmap] - if isinstance(cmap, str) - else cmap # assumed to be a Colormap instance as documented - ) - norm = _matplotlib.colors.Normalize(left, right) - rgbas = cmap(norm(values)) - if data.ndim == 1: - rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] - else: - rgbas = [ - [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas - ] + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.bar requires matplotlib." 
+ ) + cmap = ( + _matplotlib.colormaps[cmap] + if isinstance(cmap, str) + else cmap # assumed to be a Colormap instance as documented + ) + norm = _matplotlib.colors.Normalize(left, right) + rgbas = cmap(norm(values)) + if data.ndim == 1: + rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] + else: + rgbas = [ + [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas + ] assert isinstance(align, str) # mypy: should now be in [left, right, mid, zero] if data.ndim == 1: From 92207feacb0e2bc398bbe30042b4d24578df12e9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:13:31 -1000 Subject: [PATCH 054/272] CLN: Remove downcast keyword in MultiIndex.fillna (#58923) --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8b11f8087db94..a8c05ab78c98e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1677,7 +1677,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: # (previously declared in base class "IndexOpsMixin") _duplicated = duplicated # type: ignore[misc] - def fillna(self, value, downcast=None): + def fillna(self, value): """ fillna is not implemented for MultiIndex """ From 9e7abc84a11b283b71a0c8f012ca04eb79b8b882 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:14:46 -1000 Subject: [PATCH 055/272] CLN: Plotting testing asserters (#58922) --- pandas/_testing/__init__.py | 2 -- pandas/_testing/asserters.py | 22 ------------------- pandas/tests/plotting/common.py | 38 ++++++++++++++++++++++----------- 3 files changed, 26 insertions(+), 36 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index d35242ada21e9..85d03ea17bf42 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -57,7 +57,6 @@ assert_indexing_slices_equivalent, assert_interval_array_equal, assert_is_sorted, - assert_is_valid_plot_return_object, assert_metadata_equivalent, assert_numpy_array_equal, assert_period_array_equal, @@ -558,7 +557,6 @@ def shares_memory(left, right) -> bool: "assert_indexing_slices_equivalent", "assert_interval_array_equal", "assert_is_sorted", - "assert_is_valid_plot_return_object", "assert_metadata_equivalent", "assert_numpy_array_equal", "assert_period_array_equal", diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 430840711122a..1127a4512643c 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -429,28 +429,6 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None: return None -def assert_is_valid_plot_return_object(objs) -> None: - from matplotlib.artist import Artist - from matplotlib.axes import Axes - - if isinstance(objs, (Series, np.ndarray)): - if isinstance(objs, Series): - objs = objs._values - for el in objs.ravel(): - msg = ( - "one of 'objs' is not a matplotlib Axes instance, " - f"type encountered {type(el).__name__!r}" - ) - assert isinstance(el, (Axes, dict)), msg - else: - msg = ( - "objs is neither an ndarray of Artist instances nor a single " - "ArtistArtist instance, tuple, or dict, 'objs' is a " - f"{type(objs).__name__!r}" - ) - assert isinstance(objs, (Artist, tuple, dict)), msg - - def assert_is_sorted(seq) -> None: """Assert that the sequence is sorted.""" if isinstance(seq, (Index, Series)): diff --git 
a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py
index 5a46cdcb051b6..d8c49d6d47f28 100644
--- a/pandas/tests/plotting/common.py
+++ b/pandas/tests/plotting/common.py
@@ -76,8 +76,6 @@ def _check_data(xp, rs):
     xp : matplotlib Axes object
     rs : matplotlib Axes object
     """
-    import matplotlib.pyplot as plt
-
    xp_lines = xp.get_lines()
    rs_lines = rs.get_lines()

@@ -87,8 +85,6 @@ def _check_data(xp, rs):
         rsdata = rsl.get_xydata()
         tm.assert_almost_equal(xpdata, rsdata)

-    plt.close("all")
-

 def _check_visible(collections, visible=True):
     """
@@ -495,6 +491,28 @@ def get_y_axis(ax):
     return ax._shared_axes["y"]


+def assert_is_valid_plot_return_object(objs) -> None:
+    from matplotlib.artist import Artist
+    from matplotlib.axes import Axes
+
+    if isinstance(objs, (Series, np.ndarray)):
+        if isinstance(objs, Series):
+            objs = objs._values
+        for el in objs.reshape(-1):
+            msg = (
+                "one of 'objs' is not a matplotlib Axes instance, "
+                f"type encountered {type(el).__name__!r}"
+            )
+            assert isinstance(el, (Axes, dict)), msg
+    else:
+        msg = (
+            "objs is neither an ndarray of Artist instances nor a single "
+            "ArtistArtist instance, tuple, or dict, 'objs' is a "
+            f"{type(objs).__name__!r}"
+        )
+        assert isinstance(objs, (Artist, tuple, dict)), msg
+
+
 def _check_plot_works(f, default_axes=False, **kwargs):
     """
     Create plot and ensure that plot return object is valid.
@@ -530,15 +548,11 @@ def _check_plot_works(f, default_axes=False, **kwargs):
         gen_plots = _gen_two_subplots

     ret = None
-    try:
-        fig = kwargs.get("figure", plt.gcf())
-        plt.clf()
-
-        for ret in gen_plots(f, fig, **kwargs):
-            tm.assert_is_valid_plot_return_object(ret)
+    fig = kwargs.get("figure", plt.gcf())
+    fig.clf()

-    finally:
-        plt.close(fig)
+    for ret in gen_plots(f, fig, **kwargs):
+        assert_is_valid_plot_return_object(ret)

     return ret

From f7590e6f5a31742142f9a06e144a0022058a572b Mon Sep 17 00:00:00 2001
From: Abel Tavares <121238257+abeltavares@users.noreply.github.com>
Date: Wed, 5 Jun 2024 17:59:02 +0100
Subject: [PATCH 056/272] BUG/df.agg-with-df-with-missing-values-results-in-IndexError (#58864)

* fix

* improve and fix bug entry

* update

---------

Co-authored-by: 121238257
---
 doc/source/whatsnew/v3.0.0.rst                |  1 +
 pandas/core/apply.py                          |  8 +++---
 .../tests/groupby/aggregate/test_aggregate.py | 26 +++++++++++++++++++
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 7f5c879c0d9f5..802a9fd7e2099 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -456,6 +456,7 @@ Datetimelike
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
+- Bug in :meth:`DataFrame.agg` where a DataFrame with missing values could raise ``IndexError`` (:issue:`58810`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype`
incorrectly truncating those scalars (:issue:`56410`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 33f506235870d..2039386c4766c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1850,11 +1850,13 @@ def relabel_result( com.get_callable_name(f) if not isinstance(f, str) else f for f in fun ] col_idx_order = Index(s.index).get_indexer(fun) - s = s.iloc[col_idx_order] - + valid_idx = col_idx_order != -1 + if valid_idx.any(): + s = s.iloc[col_idx_order[valid_idx]] # assign the new user-provided "named aggregation" as index names, and reindex # it based on the whole user-provided names. - s.index = reordered_indexes[idx : idx + len(fun)] + if not s.empty: + s.index = reordered_indexes[idx : idx + len(fun)] reordered_result_in_dict[col] = s.reindex(columns) idx = idx + len(fun) return reordered_result_in_dict diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 1f140063fd84b..26602baedb594 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -62,6 +62,32 @@ def test_agg_ser_multi_key(df): tm.assert_series_equal(results, expected) +def test_agg_with_missing_values(): + # GH#58810 + missing_df = DataFrame( + { + "nan": [np.nan, np.nan, np.nan, np.nan], + "na": [pd.NA, pd.NA, pd.NA, pd.NA], + "nat": [pd.NaT, pd.NaT, pd.NaT, pd.NaT], + "none": [None, None, None, None], + "values": [1, 2, 3, 4], + } + ) + + result = missing_df.agg(x=("nan", "min"), y=("na", "min"), z=("values", "sum")) + + expected = DataFrame( + { + "nan": [np.nan, np.nan, np.nan], + "na": [np.nan, np.nan, np.nan], + "values": [np.nan, np.nan, 10.0], + }, + index=["x", "y", "z"], + ) + + tm.assert_frame_equal(result, expected) + + def test_groupby_aggregation_mixed_dtype(): # GH 6212 expected = DataFrame( From 811e6a43d192a4425e3c12b0a68efce96acef6cb Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Jun 2024 22:30:12 +0530 Subject: [PATCH 057/272] DOC: fix SA01 for pandas.read_spss (#58934) --- ci/code_checks.sh | 1 - pandas/io/spss.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 039700f306e03..33e072a26c94a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -474,7 +474,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.qcut PR07,SA01" \ - -i "pandas.read_spss SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 2c464cc7e90c4..313ffa79cbd09 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -52,6 +52,14 @@ def read_spss( DataFrame DataFrame based on the SPSS file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_sas : Read an SAS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
+ Examples -------- >>> df = pd.read_spss("spss_data.sav") # doctest: +SKIP From 5fd883ac4f021440ef1b95704c0a193ad3bbc382 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Jun 2024 22:30:45 +0530 Subject: [PATCH 058/272] DOC: fix PR07 for pandas.pivot (#58935) --- ci/code_checks.sh | 1 - pandas/core/reshape/pivot.py | 153 +++++++++++++++++++++++++++++++++-- 2 files changed, 146 insertions(+), 8 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 33e072a26c94a..705f01d74a972 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -469,7 +469,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.merge_asof PR07,RT03" \ -i "pandas.merge_ordered PR07" \ -i "pandas.period_range RT03,SA01" \ - -i "pandas.pivot PR07" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 86da19f13bacf..ff993c039bf9a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -11,10 +11,6 @@ import numpy as np from pandas._libs import lib -from pandas.util._decorators import ( - Appender, - Substitution, -) from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -29,7 +25,6 @@ ) import pandas.core.common as com -from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper from pandas.core.indexes.api import ( Index, @@ -656,8 +651,6 @@ def _convert_by(by): return by -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot"], indents=1) def pivot( data: DataFrame, *, @@ -665,6 +658,152 @@ def pivot( index: IndexLabel | lib.NoDefault = lib.no_default, values: IndexLabel | lib.NoDefault = lib.no_default, ) -> DataFrame: + """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation, multiple values will result in a MultiIndex in the + columns. See the :ref:`User Guide ` for more on reshaping. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + columns : str or object or a list of str + Column to use to make new frame's columns. + index : str or object or a list of str, optional + Column to use to make new frame's index. If not given, uses existing index. + values : str, object or a list of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + Returns + ------- + DataFrame + Returns reshaped DataFrame. + + Raises + ------ + ValueError: + When there are any `index`, `columns` combinations with multiple + values. `DataFrame.pivot_table` when you need to aggregate. + + See Also + -------- + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... 
"foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... "zoo": ["x", "y", "z", "q", "w", "t"], + ... } + ... ) + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + 5 two C 6 t + + >>> df.pivot(index="foo", columns="bar", values="baz") + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar")["baz"] + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar", values=["baz", "zoo"]) + baz zoo + bar A B C A B C + foo + one 1 2 3 x y z + two 4 5 6 q w t + + You could also assign a list of column names or a list of index names. + + >>> df = pd.DataFrame( + ... { + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5], + ... } + ... ) + >>> df + lev1 lev2 lev3 lev4 values + 0 1 1 1 1 0 + 1 1 1 2 2 1 + 2 1 2 1 3 2 + 3 2 1 2 4 3 + 4 2 1 1 5 4 + 5 2 2 2 6 5 + + >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") + lev2 1 2 + lev3 1 2 1 2 + lev1 + 1 0.0 1.0 2.0 NaN + 2 4.0 3.0 NaN 5.0 + + >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") + lev3 1 2 + lev1 lev2 + 1 1 0.0 1.0 + 2 2.0 NaN + 2 1 4.0 3.0 + 2 NaN 5.0 + + A ValueError is raised if there are any duplicates. + + >>> df = pd.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two"], + ... "bar": ["A", "A", "B", "C"], + ... "baz": [1, 2, 3, 4], + ... } + ... ) + >>> df + foo bar baz + 0 one A 1 + 1 one A 2 + 2 two B 3 + 3 two C 4 + + Notice that the first two rows are the same for our `index` + and `columns` arguments. + + >>> df.pivot(index="foo", columns="bar", values="baz") + Traceback (most recent call last): + ... + ValueError: Index contains duplicate entries, cannot reshape + """ columns_listlike = com.convert_to_list_like(columns) # If columns is None we will create a MultiIndex level with None as name From 0e90f66a88e0d4b16b143f1e563ec5fa3565469b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Jun 2024 22:31:11 +0530 Subject: [PATCH 059/272] DOC: fix PR07 for pandas.merge_ordered (#58936) --- ci/code_checks.sh | 1 - pandas/core/reshape/merge.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 705f01d74a972..ade7173cf0344 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -467,7 +467,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.json_normalize RT03,SA01" \ -i "pandas.merge PR07" \ -i "pandas.merge_asof PR07,RT03" \ - -i "pandas.merge_ordered PR07" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e6e84c2135b82..ddf6bd3c70988 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -316,7 +316,9 @@ def merge_ordered( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label or list Field names to join on. Must be found in both DataFrames. 
left_on : label or list, or array-like From 0d47e86e7564ced16f9674aeb907e26f91f089a6 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:56:15 +0200 Subject: [PATCH 060/272] DOC: correct examples for Resample a time series for clarity (#58947) update documentation for resample a time series to another frequency --- doc/source/getting_started/intro_tutorials/09_timeseries.rst | 2 +- doc/source/user_guide/timeseries.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index 14db38c3822dc..6ba3c17fac3c3 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in .. ipython:: python - monthly_max = no_2.resample("ME").max() + monthly_max = no_2.resample("MS").max() monthly_max A very powerful method on time series data with a datetime index, is the diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index ab3f5b314ed83..d5137baa95ab8 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1864,7 +1864,7 @@ to resample based on datetimelike column in the frame, it can passed to the ), ) df - df.resample("ME", on="date")[["a"]].sum() + df.resample("MS", on="date")[["a"]].sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1872,7 +1872,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample("ME", level="d")[["a"]].sum() + df.resample("MS", level="d")[["a"]].sum() .. _timeseries.iterating-label: From d969dd805079e28e1e4d7342e306f59348673ab1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 6 Jun 2024 05:59:30 -1000 Subject: [PATCH 061/272] REF: Make _slice_take_blocks_ax0 a generator (#58805) * REF: Make _slice_take_blocks_ax0 a generator * Remove [] --- pandas/core/internals/managers.py | 68 ++++++++++++++----------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 82b88d090f847..64109f5c1655c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -821,11 +821,13 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0( - indexer, - fill_value=fill_value, - only_slice=only_slice, - use_na_proxy=use_na_proxy, + new_blocks = list( + self._slice_take_blocks_ax0( + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, + ) ) else: new_blocks = [ @@ -857,7 +859,7 @@ def _slice_take_blocks_ax0( *, use_na_proxy: bool = False, ref_inplace_op: bool = False, - ) -> list[Block]: + ) -> Generator[Block, None, None]: """ Slice/take blocks along axis=0. 
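A short sketch of why the resampling docs above switch from ``"ME"`` to ``"MS"``: both aliases form the same monthly bins, but ``"MS"`` labels them with the month start, which reads more clearly in the tutorial. Dates below are illustrative:

    import pandas as pd

    s = pd.Series(
        range(60),
        index=pd.date_range("2024-01-01", periods=60, freq="D"),
    )

    s.resample("MS").max()  # bins labelled 2024-01-01, 2024-02-01, ...
    s.resample("ME").max()  # same maxima, labelled 2024-01-31, 2024-02-29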
@@ -875,9 +877,9 @@ def _slice_take_blocks_ax0( ref_inplace_op: bool, default False Don't track refs if True because we operate inplace - Returns - ------- - new_blocks : list of Block + Yields + ------ + Block : New Block """ allow_fill = fill_value is not lib.no_default @@ -892,9 +894,10 @@ def _slice_take_blocks_ax0( # GH#32959 EABlock would fail since we can't make 0-width # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: - return [] + return bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + yield blk.getitem_block_columns(slobj, new_mgr_locs=bp) + return elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -902,25 +905,21 @@ def _slice_take_blocks_ax0( if not allow_fill and only_slice: # GH#33597 slice instead of take, so we get # views instead of copies - blocks = [ - blk.getitem_block_columns( + for i, ml in enumerate(slobj): + yield blk.getitem_block_columns( slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i), ref_inplace_op=ref_inplace_op, ) - for i, ml in enumerate(slobj) - ] - return blocks else: bp = BlockPlacement(slice(0, sllen)) - return [ - blk.take_nd( - slobj, - axis=0, - new_mgr_locs=bp, - fill_value=fill_value, - ) - ] + yield blk.take_nd( + slobj, + axis=0, + new_mgr_locs=bp, + fill_value=fill_value, + ) + return if sl_type == "slice": blknos = self.blknos[slobj] @@ -935,18 +934,15 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). - blocks = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: # If we've got here, fill_value was not lib.no_default - blocks.append( - self._make_na_block( - placement=mgr_locs, - fill_value=fill_value, - use_na_proxy=use_na_proxy, - ) + yield self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, ) else: blk = self.blocks[blkno] @@ -961,7 +957,7 @@ def _slice_take_blocks_ax0( for mgr_loc in mgr_locs: newblk = blk.copy(deep=deep) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) - blocks.append(newblk) + yield newblk else: # GH#32779 to avoid the performance penalty of copying, @@ -972,7 +968,7 @@ def _slice_take_blocks_ax0( if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) - blocks.append(nb) + yield nb elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies @@ -981,12 +977,10 @@ def _slice_take_blocks_ax0( bp = BlockPlacement(ml) nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) - blocks.append(nb) + yield nb else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) - - return blocks + yield nb def _make_na_block( self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False From d6b7d5c9a25e3dd02c8ea249a92f2f0f329b0e42 Mon Sep 17 00:00:00 2001 From: Georgios Malandrakis <93475472+giormala@users.noreply.github.com> Date: Fri, 7 Jun 2024 00:01:04 +0300 Subject: [PATCH 062/272] DOC: Add notes to nullable types documentation about pd.NA column type (#58163) --- doc/source/user_guide/boolean.rst | 13 +++++++++++++ doc/source/user_guide/integer_na.rst | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 3c361d4de17e5..7de0430123fd2 100644 --- 
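The shape of the ``_slice_take_blocks_ax0`` refactor above, reduced to a toy (the real code yields ``Block`` objects, not ints):

    from collections.abc import Generator

    # Yield results lazily instead of accumulating them in a list;
    # call sites that genuinely need a list now wrap the call in list().
    def take_items(values: list[int]) -> Generator[int, None, None]:
        for v in values:
            yield v

    items = list(take_items([3, 1, 5]))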
a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -37,6 +37,19 @@ If you would prefer to keep the ``NA`` values you can manually fill them with `` s[mask.fillna(True)] +If you create a column of ``NA`` values (for example to fill them later) +with ``df['new_col'] = pd.NA``, the ``dtype`` would be set to ``object`` in the +new column. The performance on this column will be worse than with +the appropriate type. It's better to use +``df['new_col'] = pd.Series(pd.NA, dtype="boolean")`` +(or another ``dtype`` that supports ``NA``). + +.. ipython:: python + + df = pd.DataFrame() + df['objects'] = pd.NA + df.dtypes + .. _boolean.kleene: Kleene logical operations diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 1a727cd78af09..76a2f22b7987d 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -84,6 +84,19 @@ with the dtype. In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +If you create a column of ``NA`` values (for example to fill them later) +with ``df['new_col'] = pd.NA``, the ``dtype`` would be set to ``object`` in the +new column. The performance on this column will be worse than with +the appropriate type. It's better to use +``df['new_col'] = pd.Series(pd.NA, dtype="Int64")`` +(or another ``dtype`` that supports ``NA``). + +.. ipython:: python + + df = pd.DataFrame() + df['objects'] = pd.NA + df.dtypes + Operations ---------- From 8b705675bcb8ae86fb03e388e9969a1772236bf0 Mon Sep 17 00:00:00 2001 From: Pedro Freitas <102478434+PF2100@users.noreply.github.com> Date: Thu, 6 Jun 2024 22:16:33 +0100 Subject: [PATCH 063/272] ENH: Add **kwargs to pivot_table to allow the specification of aggfunc keyword arguments #57884 (#58893) Co-authored-by: Pedro Freitas Co-authored-by: Rui Amaral --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 7 ++++ pandas/core/reshape/pivot.py | 63 ++++++++++++++++++++++-------- pandas/tests/reshape/test_pivot.py | 54 +++++++++++++++++++++++++ 4 files changed, 109 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 802a9fd7e2099..abf18968076d0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -42,6 +42,7 @@ Other enhancements - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) +- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c37dfa225de5a..a6c0e1e372530 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9275,6 +9275,11 @@ def pivot( .. 
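The nullable-dtype note added above, condensed into a runnable comparison (frame contents are illustrative):

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3]})
    df["a"] = pd.NA                              # dtype ends up object
    df["b"] = pd.Series(pd.NA, dtype="boolean")  # dtype stays boolean
    df.dtypes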
versionadded:: 1.3.0 + **kwargs : dict + Optional keyword arguments to pass to ``aggfunc``. + + .. versionadded:: 3.0.0 + Returns ------- DataFrame @@ -9382,6 +9387,7 @@ def pivot_table( margins_name: Level = "All", observed: bool = True, sort: bool = True, + **kwargs, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table @@ -9397,6 +9403,7 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, + **kwargs, ) def stack( diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ff993c039bf9a..8c2c2053b0554 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -61,6 +61,7 @@ def pivot_table( margins_name: Hashable = "All", observed: bool = True, sort: bool = True, + **kwargs, ) -> DataFrame: """ Create a spreadsheet-style pivot table as a DataFrame. @@ -119,6 +120,11 @@ def pivot_table( .. versionadded:: 1.3.0 + **kwargs : dict + Optional keyword arguments to pass to ``aggfunc``. + + .. versionadded:: 3.0.0 + Returns ------- DataFrame @@ -246,6 +252,7 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, + kwargs=kwargs, ) pieces.append(_table) keys.append(getattr(func, "__name__", func)) @@ -265,6 +272,7 @@ def pivot_table( margins_name, observed, sort, + kwargs, ) return table.__finalize__(data, method="pivot_table") @@ -281,6 +289,7 @@ def __internal_pivot_table( margins_name: Hashable, observed: bool, sort: bool, + kwargs, ) -> DataFrame: """ Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``. @@ -323,7 +332,7 @@ def __internal_pivot_table( values = list(values) grouped = data.groupby(keys, observed=observed, sort=sort, dropna=dropna) - agged = grouped.agg(aggfunc) + agged = grouped.agg(aggfunc, **kwargs) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") @@ -378,6 +387,7 @@ def __internal_pivot_table( rows=index, cols=columns, aggfunc=aggfunc, + kwargs=kwargs, observed=dropna, margins_name=margins_name, fill_value=fill_value, @@ -403,6 +413,7 @@ def _add_margins( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", fill_value=None, @@ -415,7 +426,7 @@ def _add_margins( if margins_name in table.index.get_level_values(level): raise ValueError(msg) - grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) + grand_margin = _compute_grand_margin(data, values, aggfunc, kwargs, margins_name) if table.ndim == 2: # i.e. 
DataFrame @@ -436,7 +447,15 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name + table, + data, + values, + rows, + cols, + aggfunc, + kwargs, + observed, + margins_name, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -445,7 +464,7 @@ def _add_margins( # no values, and table is a DataFrame assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name + table, data, rows, cols, aggfunc, kwargs, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -482,26 +501,26 @@ def _add_margins( def _compute_grand_margin( - data: DataFrame, values, aggfunc, margins_name: Hashable = "All" + data: DataFrame, values, aggfunc, kwargs, margins_name: Hashable = "All" ): if values: grand_margin = {} for k, v in data[values].items(): try: if isinstance(aggfunc, str): - grand_margin[k] = getattr(v, aggfunc)() + grand_margin[k] = getattr(v, aggfunc)(**kwargs) elif isinstance(aggfunc, dict): if isinstance(aggfunc[k], str): - grand_margin[k] = getattr(v, aggfunc[k])() + grand_margin[k] = getattr(v, aggfunc[k])(**kwargs) else: - grand_margin[k] = aggfunc[k](v) + grand_margin[k] = aggfunc[k](v, **kwargs) else: - grand_margin[k] = aggfunc(v) + grand_margin[k] = aggfunc(v, **kwargs) except TypeError: pass return grand_margin else: - return {margins_name: aggfunc(data.index)} + return {margins_name: aggfunc(data.index, **kwargs)} def _generate_marginal_results( @@ -511,6 +530,7 @@ def _generate_marginal_results( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", ): @@ -524,7 +544,11 @@ def _all_key(key): return (key, margins_name) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) + margin = ( + data[rows + values] + .groupby(rows, observed=observed) + .agg(aggfunc, **kwargs) + ) cat_axis = 1 for key, piece in table.T.groupby(level=0, observed=observed): @@ -549,7 +573,7 @@ def _all_key(key): table_pieces.append(piece) # GH31016 this is to calculate margin for each group, and assign # corresponded key as index - transformed_piece = DataFrame(piece.apply(aggfunc)).T + transformed_piece = DataFrame(piece.apply(aggfunc, **kwargs)).T if isinstance(piece.index, MultiIndex): # We are adding an empty level transformed_piece.index = MultiIndex.from_tuples( @@ -579,7 +603,9 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) + row_margin = ( + data[cols + values].groupby(cols, observed=observed).agg(aggfunc, **kwargs) + ) row_margin = row_margin.stack() # GH#26568. 
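With the change above applied, keyword arguments flow from ``pivot_table`` through to ``aggfunc`` on both the grouped and margin paths; a minimal sketch, with an illustrative function and data:

    import pandas as pd

    def scaled_sum(s, factor=1):
        return s.sum() * factor

    df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})

    # factor=10 is forwarded to scaled_sum for every group.
    df.pivot_table(index="A", values="B", aggfunc=scaled_sum, factor=10)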
Use names instead of indices in case of numeric names @@ -598,6 +624,7 @@ def _generate_marginal_results_without_values( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", ): @@ -612,14 +639,16 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply( + aggfunc, **kwargs + ) all_key = _all_key() table[all_key] = margin result = table margin_keys.append(all_key) else: - margin = data.groupby(level=0, observed=observed).apply(aggfunc) + margin = data.groupby(level=0, observed=observed).apply(aggfunc, **kwargs) all_key = _all_key() table[all_key] = margin result = table @@ -630,7 +659,9 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply( + aggfunc, **kwargs + ) else: row_margin = Series(np.nan, index=result.columns) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 4a13c1f5e1167..728becc76b71f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2058,6 +2058,60 @@ def test_pivot_string_as_func(self): ).rename_axis("A") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("kwargs", [{"a": 2}, {"a": 2, "b": 3}, {"b": 3, "a": 2}]) + def test_pivot_table_kwargs(self, kwargs): + # GH#57884 + def f(x, a, b=3): + return x.sum() * a + b + + def g(x): + return f(x, **kwargs) + + df = DataFrame( + { + "A": ["good", "bad", "good", "bad", "good"], + "B": ["one", "two", "one", "three", "two"], + "X": [2, 5, 4, 20, 10], + } + ) + result = pivot_table( + df, index="A", columns="B", values="X", aggfunc=f, **kwargs + ) + expected = pivot_table(df, index="A", columns="B", values="X", aggfunc=g) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs", [{}, {"b": 10}, {"a": 3}, {"a": 3, "b": 10}, {"b": 10, "a": 3}] + ) + def test_pivot_table_kwargs_margin(self, data, kwargs): + # GH#57884 + def f(x, a=5, b=7): + return (x.sum() + b) * a + + def g(x): + return f(x, **kwargs) + + result = data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + aggfunc=f, + margins=True, + fill_value=0, + **kwargs, + ) + + expected = data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + aggfunc=g, + margins=True, + fill_value=0, + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "f, f_numpy", [ From ee127fc0f2065ebf71b3013ac7e56e5da19f1969 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Thu, 6 Jun 2024 22:37:43 -0400 Subject: [PATCH 064/272] DOC: minor grammar fix "en -> and" (#58953) doc: typo - en -> and --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index bfaba866c3dfd..3d1bd8ebb03cb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4965,7 +4965,7 @@ def drop( C 2 dtype: int64 - Drop labels B en C + Drop labels B and C >>> s.drop(labels=["B", "C"]) A 0 From c95716ab29313b4b5c6122e52bc9d3a082d6ae3e Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Fri, 7 Jun 2024 19:27:30 +0200 Subject: [PATCH 065/272] CLN: enforce deprecation of frequencies deprecated for offsets (#57986) * enforce deprecation of offset deprecated freqstr * fix tests * fix mypy error * add a note to 
v3.0.0 * remove c_REVERSE_OFFSET_REMOVED_FREQSTR, correct tests * fix test_to_period_offsets_not_supported * correct to_offset, fix tests * add dict PERIOD_TO_OFFSET_FREQSTR, corect meth to_offset, fix tests * add a comment * create dictionary PERIOD_AND_OFFSET_ALIASES * correct def to_offset * fixup * fixup * replace c_OFFSET_RENAMED_FREQSTR with c_PERIOD_TO_OFFSET_FREQSTR * add cimport c_PERIOD_TO_OFFSET_FREQSTR * add a helper function for error reporting * minor whatsnew fix --------- Co-authored-by: Marco Edward Gorelli Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 30 +++++- pandas/_libs/tslibs/dtypes.pxd | 5 +- pandas/_libs/tslibs/dtypes.pyx | 45 ++++++++- pandas/_libs/tslibs/offsets.pyx | 96 ++++++++++--------- pandas/tests/arrays/test_datetimes.py | 45 ++++----- pandas/tests/frame/methods/test_asfreq.py | 18 ++-- .../datetimes/methods/test_to_period.py | 32 +++---- .../indexes/datetimes/test_date_range.py | 57 +++-------- .../tests/indexes/datetimes/test_datetime.py | 27 +----- .../indexes/period/methods/test_asfreq.py | 25 ++--- .../tests/indexes/period/test_constructors.py | 21 ++-- .../tests/indexes/period/test_period_range.py | 24 +++-- pandas/tests/resample/test_datetime_index.py | 44 ++------- pandas/tests/resample/test_period_index.py | 74 ++++++-------- pandas/tests/scalar/period/test_asfreq.py | 12 +-- pandas/tests/scalar/period/test_period.py | 2 +- pandas/tests/tslibs/test_to_offset.py | 18 ++-- 17 files changed, 274 insertions(+), 301 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index abf18968076d0..2b45c8aa4865f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -280,6 +280,34 @@ Other Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Enforced deprecation of aliases ``M``, ``Q``, ``Y``, etc. in favour of ``ME``, ``QE``, ``YE``, etc. 
for offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Renamed the following offset aliases (:issue:`57986`): + ++-------------------------------+------------------+------------------+ +| offset | removed alias | new alias | ++===============================+==================+==================+ +|:class:`MonthEnd` | ``M`` | ``ME`` | ++-------------------------------+------------------+------------------+ +|:class:`BusinessMonthEnd` | ``BM`` | ``BME`` | ++-------------------------------+------------------+------------------+ +|:class:`SemiMonthEnd` | ``SM`` | ``SME`` | ++-------------------------------+------------------+------------------+ +|:class:`CustomBusinessMonthEnd`| ``CBM`` | ``CBME`` | ++-------------------------------+------------------+------------------+ +|:class:`QuarterEnd` | ``Q`` | ``QE`` | ++-------------------------------+------------------+------------------+ +|:class:`BQuarterEnd` | ``BQ`` | ``BQE`` | ++-------------------------------+------------------+------------------+ +|:class:`YearEnd` | ``Y`` | ``YE`` | ++-------------------------------+------------------+------------------+ +|:class:`BYearEnd` | ``BY`` | ``BYE`` | ++-------------------------------+------------------+------------------+ + +Other Removals +^^^^^^^^^^^^^^ - :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`) - :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`) - :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`) @@ -343,7 +371,7 @@ Removal of prior version deprecations/changes - Enforced deprecation of string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57699`) - Enforced deprecation of string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`57793`) - Enforced deprecation of string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57793`) -- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) +- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`) - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. 
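In user-facing terms the table above means the following (dates illustrative):

    import pandas as pd

    pd.date_range("2024-01-01", periods=3, freq="ME")  # offsets: "-E" names
    # pd.date_range("2024-01-01", periods=3, freq="M") # now: ValueError

    # Periods keep the short spellings; the "-E" forms are invalid there.
    pd.period_range("2024-01", periods=3, freq="M")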
Now this raises a ``ValueError`` (:issue:`43485`) - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 33f6789f3b402..455bca35d160a 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -12,9 +12,10 @@ cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cdef dict c_OFFSET_TO_PERIOD_FREQSTR -cdef dict c_OFFSET_DEPR_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR +cdef dict c_PERIOD_TO_OFFSET_FREQSTR +cdef dict c_OFFSET_RENAMED_FREQSTR cdef dict c_DEPR_ABBREVS +cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname cdef dict attrname_to_npy_unit diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 5bfbe211bfd14..479a5a328b1d8 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -176,6 +176,10 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "EOM": "M", "BME": "M", "SME": "M", + "BMS": "M", + "CBME": "M", + "CBMS": "M", + "SMS": "M", "BQS": "Q", "QS": "Q", "BQE": "Q", @@ -228,7 +232,6 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "YE-NOV": "Y-NOV", "W": "W", "ME": "M", - "Y": "Y", "BYE": "Y", "BYE-DEC": "Y-DEC", "BYE-JAN": "Y-JAN", @@ -245,7 +248,7 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "YS": "Y", "BYS": "Y", } -cdef dict c_OFFSET_DEPR_FREQSTR = { +cdef dict c_OFFSET_RENAMED_FREQSTR = { "M": "ME", "Q": "QE", "Q-DEC": "QE-DEC", @@ -303,10 +306,37 @@ cdef dict c_OFFSET_DEPR_FREQSTR = { "BQ-OCT": "BQE-OCT", "BQ-NOV": "BQE-NOV", } -cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = { - v: k for k, v in c_OFFSET_DEPR_FREQSTR.items() +PERIOD_TO_OFFSET_FREQSTR = { + "M": "ME", + "Q": "QE", + "Q-DEC": "QE-DEC", + "Q-JAN": "QE-JAN", + "Q-FEB": "QE-FEB", + "Q-MAR": "QE-MAR", + "Q-APR": "QE-APR", + "Q-MAY": "QE-MAY", + "Q-JUN": "QE-JUN", + "Q-JUL": "QE-JUL", + "Q-AUG": "QE-AUG", + "Q-SEP": "QE-SEP", + "Q-OCT": "QE-OCT", + "Q-NOV": "QE-NOV", + "Y": "YE", + "Y-DEC": "YE-DEC", + "Y-JAN": "YE-JAN", + "Y-FEB": "YE-FEB", + "Y-MAR": "YE-MAR", + "Y-APR": "YE-APR", + "Y-MAY": "YE-MAY", + "Y-JUN": "YE-JUN", + "Y-JUL": "YE-JUL", + "Y-AUG": "YE-AUG", + "Y-SEP": "YE-SEP", + "Y-OCT": "YE-OCT", + "Y-NOV": "YE-NOV", } +cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR +cdef dict c_PERIOD_TO_OFFSET_FREQSTR = PERIOD_TO_OFFSET_FREQSTR # Map deprecated resolution abbreviations to correct resolution abbreviations cdef dict c_DEPR_ABBREVS = { @@ -316,6 +346,11 @@ cdef dict c_DEPR_ABBREVS = { "S": "s", } +cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { + "w": "W", + "MIN": "min", +} + class FreqGroup(Enum): # Mirrors c_FreqGroup in the .pxd file diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index f9d63065493c3..a24941e4f0a5a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -57,8 +57,10 @@ from pandas._libs.tslibs.ccalendar cimport ( from pandas._libs.tslibs.conversion cimport localize_pydatetime from pandas._libs.tslibs.dtypes cimport ( c_DEPR_ABBREVS, - c_OFFSET_DEPR_FREQSTR, - c_REVERSE_OFFSET_DEPR_FREQSTR, + c_OFFSET_RENAMED_FREQSTR, + c_OFFSET_TO_PERIOD_FREQSTR, + c_PERIOD_AND_OFFSET_DEPR_FREQSTR, + c_PERIOD_TO_OFFSET_FREQSTR, periods_per_day, ) from pandas._libs.tslibs.nattype cimport ( @@ -4711,6 +4713,34 @@ 
INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" _offset_map = {} +def _validate_to_offset_alias(alias: str, is_period: bool) -> None: + if not is_period: + if alias.upper() in c_OFFSET_RENAMED_FREQSTR: + raise ValueError( + f"\'{alias}\' is no longer supported for offsets. Please " + f"use \'{c_OFFSET_RENAMED_FREQSTR.get(alias.upper())}\' " + f"instead." + ) + if (alias.upper() != alias and + alias.lower() not in {"s", "ms", "us", "ns"} and + alias.upper().split("-")[0].endswith(("S", "E"))): + raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) + if (is_period and + alias.upper() in c_OFFSET_TO_PERIOD_FREQSTR and + alias != "ms" and + alias.upper().split("-")[0].endswith(("S", "E"))): + if (alias.upper().startswith("B") or + alias.upper().startswith("S") or + alias.upper().startswith("C")): + raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) + else: + alias_msg = "".join(alias.upper().split("E", 1)) + raise ValueError( + f"for Period, please use \'{alias_msg}\' " + f"instead of \'{alias}\'" + ) + + # TODO: better name? def _get_offset(name: str) -> BaseOffset: """ @@ -4850,54 +4880,26 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if not is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = c_OFFSET_DEPR_FREQSTR[name.upper()] - if (not is_period and - name != name.upper() and - name.lower() not in {"s", "ms", "us", "ns"} and - name.upper().split("-")[0].endswith(("S", "E"))): - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{name.upper()}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = name.upper() - if is_period and name.upper() in c_REVERSE_OFFSET_DEPR_FREQSTR: - if name.upper().startswith("Y"): - raise ValueError( - f"for Period, please use \'Y{name.upper()[2:]}\' " - f"instead of \'{name}\'" - ) - if (name.upper().startswith("B") or - name.upper().startswith("S") or - name.upper().startswith("C")): - raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) - else: - raise ValueError( - f"for Period, please use " - f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name.upper())}\' " - f"instead of \'{name}\'" - ) - elif is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: - if name.upper() != name: + _validate_to_offset_alias(name, is_period) + if is_period: + if name.upper() in c_PERIOD_TO_OFFSET_FREQSTR: + if name.upper() != name: + raise ValueError( + f"\'{name}\' is no longer supported, " + f"please use \'{name.upper()}\' instead.", + ) + name = c_PERIOD_TO_OFFSET_FREQSTR.get(name.upper()) + + if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: warnings.warn( - f"\'{name}\' is deprecated and will be removed in " - f"a future version, please use \'{name.upper()}\' " - f"instead.", + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f" instead.", FutureWarning, stacklevel=find_stack_level(), - ) - name = c_OFFSET_DEPR_FREQSTR.get(name.upper()) - + ) + name = c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = _lite_rule_alias.get(name) or name diff --git a/pandas/tests/arrays/test_datetimes.py 
b/pandas/tests/arrays/test_datetimes.py index 8650be62ae7eb..63d60c78da482 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -764,29 +764,14 @@ def test_iter_zoneinfo_fold(self, tz): assert left.utcoffset() == right2.utcoffset() @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2SME", "2SM"), - ("2SME", "2sm"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ("2ME", "2m"), - ("2QE-SEP", "2q-sep"), - ("2YE", "2y"), - ], + "freq", + ["2M", "2SM", "2sm", "2Q", "2Q-SEP", "1Y", "2Y-MAR", "2m", "2q-sep", "2y"], ) - def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): - # GH#9586, GH#54275 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." + def test_date_range_frequency_M_Q_Y_raises(self, freq): + msg = f"Invalid frequency: {freq}" - expected = pd.date_range("1/1/2000", periods=4, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"]) def test_date_range_uppercase_frequency_deprecated(self, freq_depr): @@ -800,7 +785,7 @@ def test_date_range_uppercase_frequency_deprecated(self, freq_depr): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "freq_depr", + "freq", [ "2ye-mar", "2ys", @@ -811,17 +796,21 @@ def test_date_range_uppercase_frequency_deprecated(self, freq_depr): "2bms", "2cbme", "2me", - "2w", ], ) - def test_date_range_lowercase_frequency_deprecated(self, freq_depr): + def test_date_range_lowercase_frequency_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) + + def test_date_range_lowercase_frequency_deprecated(self): # GH#9586, GH#54939 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version, please use '{freq_depr.upper()[1:]}' instead." + depr_msg = "'w' is deprecated and will be removed in a future version" - expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.upper()) + expected = pd.date_range("1/1/2000", periods=4, freq="2W") with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + result = pd.date_range("1/1/2000", periods=4, freq="2w") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("freq", ["1A", "2A-MAR", "2a-mar"]) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index fb288e19c6e82..1c3c41e2e0299 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -236,32 +236,30 @@ def test_asfreq_2ME(self, freq, freq_half): "freq, freq_depr", [ ("2ME", "2M"), + ("2ME", "2m"), ("2QE", "2Q"), ("2QE-SEP", "2Q-SEP"), ("1BQE", "1BQ"), ("2BQE-SEP", "2BQ-SEP"), - ("1YE", "1Y"), + ("2BQE-SEP", "2bq-sep"), + ("1YE", "1y"), ("2YE-MAR", "2Y-MAR"), ], ) - def test_asfreq_frequency_M_Q_Y_deprecated(self, freq, freq_depr): - # GH#9586, #55978 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
+ def test_asfreq_frequency_M_Q_Y_raises(self, freq, freq_depr): + msg = f"Invalid frequency: {freq_depr}" index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}") df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) - expected = df.asfreq(freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = df.asfreq(freq=freq_depr) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + df.asfreq(freq=freq_depr) @pytest.mark.parametrize( "freq, error_msg", [ ( "2MS", - "MS is not supported as period frequency", + "Invalid frequency: 2MS", ), ( offsets.MonthBegin(), diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 5b2cc55d6dc56..8e279162b7012 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -90,24 +90,14 @@ def test_dti_to_period_2monthish(self, freq_offset, freq_period): tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq_period)) @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ], + "freq", ["2ME", "1me", "2QE", "2QE-SEP", "1YE", "ye", "2YE-MAR"] ) - def test_to_period_frequency_M_Q_Y_deprecated(self, freq, freq_depr): - # GH#9586 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." + def test_to_period_frequency_M_Q_Y_raises(self, freq): + msg = f"Invalid frequency: {freq}" - rng = date_range("01-Jan-2012", periods=8, freq=freq) - prng = rng.to_period() - with tm.assert_produces_warning(FutureWarning, match=msg): - assert prng.freq == freq_depr + rng = date_range("01-Jan-2012", periods=8, freq="ME") + with pytest.raises(ValueError, match=msg): + rng.to_period(freq) def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 @@ -208,10 +198,16 @@ def test_to_period_nofreq(self): assert idx.freqstr is None tm.assert_index_equal(idx.to_period(), expected) - @pytest.mark.parametrize("freq", ["2BMS", "1SME-15"]) + @pytest.mark.parametrize("freq", ["2BME", "SME-15", "2BMS"]) def test_to_period_offsets_not_supported(self, freq): # GH#56243 - msg = f"{freq[1:]} is not supported as period frequency" + msg = "|".join( + [ + f"Invalid frequency: {freq}", + f"{freq} is not supported as period frequency", + ] + ) + ts = date_range("1/1/2012", periods=4, freq=freq) with pytest.raises(ValueError, match=msg): ts.to_period() diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 1ab1fc8e744ba..8bf51bcd38862 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -144,24 +144,12 @@ def test_date_range_fractional_period(self): with pytest.raises(TypeError, match=msg): date_range("1/1/2000", periods=10.5) - @pytest.mark.parametrize( - "freq,freq_depr", - [ - ("2ME", "2M"), - ("2SME", "2SM"), - ("2BQE", "2BQ"), - ("2BYE", "2BY"), - ], - ) - def test_date_range_frequency_M_SM_BQ_BY_deprecated(self, freq, freq_depr): - # GH#52064 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
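The ``to_period`` test above pins down offsets that have no ``Period`` equivalent; for example (exact message text may vary, per the patterns matched in the test):

    import pandas as pd

    dti = pd.date_range("2024-01-01", periods=4, freq="2BME")
    try:
        dti.to_period()  # business month-end has no Period counterpart
    except ValueError as err:
        print(err)       # e.g. "Invalid frequency: 2BME"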
- - expected = date_range("1/1/2000", periods=4, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = date_range("1/1/2000", periods=4, freq=freq_depr) - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("freq", ["2M", "1m", "2SM", "2BQ", "1bq", "2BY"]) + def test_date_range_frequency_M_SM_BQ_BY_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=4, freq=freq) def test_date_range_tuple_freq_raises(self): # GH#34703 @@ -777,36 +765,13 @@ def test_frequency_H_T_S_L_U_N_raises(self, freq): date_range("1/1/2000", periods=2, freq=freq) @pytest.mark.parametrize( - "freq,freq_depr", - [ - ("YE", "Y"), - ("YE-MAY", "Y-MAY"), - ], + "freq_depr", ["m", "bm", "CBM", "SM", "BQ", "q-feb", "y-may", "Y-MAY"] ) - def test_frequencies_Y_renamed(self, freq, freq_depr): - # GH#9586, GH#54275 - freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] - freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] - msg = f"'{freq_depr_msg}' is deprecated and will be removed " - f"in a future version, please use '{freq_msg}' instead." - - expected = date_range("1/1/2000", periods=2, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = date_range("1/1/2000", periods=2, freq=freq_depr) - tm.assert_index_equal(result, expected) + def test_frequency_raises(self, freq_depr): + msg = f"Invalid frequency: {freq_depr}" - def test_to_offset_with_lowercase_deprecated_freq(self) -> None: - # https://github.com/pandas-dev/pandas/issues/56847 - msg = ( - "'m' is deprecated and will be removed in a future version, please use " - "'ME' instead." - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = date_range("2010-01-01", periods=2, freq="m") - expected = DatetimeIndex( - ["2010-01-31", "2010-02-28"], dtype="M8[ns]", freq="ME" - ) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=2, freq=freq_depr) def test_date_range_bday(self): sdate = datetime(1999, 12, 25) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 84a616f05cd63..cc2b802de2a16 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -157,29 +157,12 @@ def test_CBH_deprecated(self): tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "freq, expected_values, freq_depr", - [ - ("2BYE-JUN", ["2016-06-30"], "2BY-JUN"), - ("2BME", ["2016-02-29", "2016-04-29", "2016-06-30"], "2BM"), - ("2BQE", ["2016-03-31"], "2BQ"), - ("1BQE-MAR", ["2016-03-31", "2016-06-30"], "1BQ-MAR"), - ], - ) - def test_BM_BQ_BY_deprecated(self, freq, expected_values, freq_depr): - # GH#52064 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
- - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range(start="2016-02-21", end="2016-08-21", freq=freq_depr) - result = DatetimeIndex( - data=expected_values, - dtype="datetime64[ns]", - freq=freq, - ) + @pytest.mark.parametrize("freq", ["2BM", "1bm", "2BQ", "1BQ-MAR", "2BY-JUN", "1by"]) + def test_BM_BQ_BY_raises(self, freq): + msg = f"Invalid frequency: {freq}" - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range(start="2016-02-21", end="2016-08-21", freq=freq) @pytest.mark.parametrize("freq", ["2BA-MAR", "1BAS-MAY", "2AS-AUG"]) def test_BA_BAS_raises(self, freq): diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index ea305a9766103..8fca53c28a036 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -142,21 +142,24 @@ def test_asfreq_with_different_n(self): tm.assert_series_equal(result, excepted) @pytest.mark.parametrize( - "freq, is_str", + "freq", [ - ("2BMS", True), - ("2YS-MAR", True), - ("2bh", True), - (offsets.MonthBegin(2), False), - (offsets.BusinessMonthEnd(2), False), + "2BMS", + "2YS-MAR", + "2bh", + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), ], ) - def test_pi_asfreq_not_supported_frequency(self, freq, is_str): + def test_pi_asfreq_not_supported_frequency(self, freq): # GH#55785, GH#56945 - if is_str: - msg = f"{freq[1:]} is not supported as period frequency" - else: - msg = re.escape(f"{freq} is not supported as period frequency") + msg = "|".join( + [ + f"Invalid frequency: {freq}", + re.escape(f"{freq} is not supported as period frequency"), + "bh is not supported as period frequency", + ] + ) pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 6aba9f17326ba..aca765e7167b2 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -33,7 +33,7 @@ class TestPeriodIndexDisallowedFreqs: ) def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): # GH#52064 - msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + msg = f"Invalid frequency: {freq_depr}" with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) @@ -41,20 +41,23 @@ def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): with pytest.raises(ValueError, match=msg): period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2SME", "2sme", "2CBME", "2BYE", "2Bye"]) - def test_period_index_frequency_invalid_freq(self, freq_depr): + @pytest.mark.parametrize( + "freq", + ["2SME", "2sme", "2BYE", "2Bye", "2CBME"], + ) + def test_period_index_frequency_invalid_freq(self, freq): # GH#9586 - msg = f"Invalid frequency: {freq_depr[1:]}" + msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): - period_range("2020-01", "2020-05", freq=freq_depr) + period_range("2020-01", "2020-05", freq=freq) with pytest.raises(ValueError, match=msg): - PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + PeriodIndex(["2020-01", "2020-05"], freq=freq) @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) def test_period_index_from_datetime_index_invalid_freq(self, freq): # GH#56899 - msg = f"Invalid 
frequency: {freq[1:]}" + msg = f"Invalid frequency: {freq}" rng = date_range("01-Jan-2012", periods=8, freq=freq) with pytest.raises(ValueError, match=msg): @@ -542,9 +545,7 @@ def test_mixed_freq_raises(self): with tm.assert_produces_warning(FutureWarning, match=msg): end_intv = Period("2005-05-01", "B") - msg = "'w' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): - vals = [end_intv, Period("2006-12-31", "w")] + vals = [end_intv, Period("2006-12-31", "W")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" depr_msg = r"PeriodDtype\[B\] is deprecated" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 67f4d7421df23..4e58dc1f324b2 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -181,10 +181,8 @@ def test_construction_from_period(self): def test_mismatched_start_end_freq_raises(self): depr_msg = "Period with BDay freq is deprecated" - msg = "'w' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): - end_w = Period("2006-12-31", "1w") + end_w = Period("2006-12-31", "1W") with tm.assert_produces_warning(FutureWarning, match=depr_msg): start_b = Period("02-Apr-2005", "B") end_b = Period("2005-05-01", "B") @@ -214,14 +212,13 @@ def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): with tm.assert_produces_warning(FutureWarning, match=msg): period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2m", "2q-sep", "2y", "2w"]) - def test_lowercase_freq_deprecated_from_time_series(self, freq_depr): + @pytest.mark.parametrize("freq", ["2m", "2q-sep", "2y"]) + def test_lowercase_freq_from_time_series_raises(self, freq): # GH#52536, GH#54939 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq_depr.upper()[1:]}' instead." + msg = f"Invalid frequency: {freq}" - with tm.assert_produces_warning(FutureWarning, match=msg): - period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009") + with pytest.raises(ValueError, match=msg): + period_range(freq=freq, start="1/1/2001", end="12/1/2009") @pytest.mark.parametrize("freq", ["2A", "2a", "2A-AUG", "2A-aug"]) def test_A_raises_from_time_series(self, freq): @@ -229,3 +226,12 @@ def test_A_raises_from_time_series(self, freq): with pytest.raises(ValueError, match=msg): period_range(freq=freq, start="1/1/2001", end="12/1/2009") + + @pytest.mark.parametrize("freq", ["2w"]) + def test_lowercase_freq_from_time_series_deprecated(self, freq): + # GH#52536, GH#54939 + msg = f"'{freq[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq.upper()[1:]}' instead." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + period_range(freq=freq, start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index c38d223c9d6a0..7f37ca6831faa 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2014,46 +2014,22 @@ def test_resample_empty_series_with_tz(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ], -) -def test_resample_M_Q_Y_deprecated(freq, freq_depr): - # GH#9586 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." +@pytest.mark.parametrize("freq", ["2M", "2m", "2Q", "2Q-SEP", "2q-sep", "1Y", "2Y-MAR"]) +def test_resample_M_Q_Y_raises(freq): + msg = f"Invalid frequency: {freq}" s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample(freq).mean() - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = s.resample(freq_depr).mean() - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() -@pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2BME", "2BM"), - ("2BQE", "2BQ"), - ("2BQE-MAR", "2BQ-MAR"), - ], -) -def test_resample_BM_BQ_deprecated(freq, freq_depr): - # GH#52064 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." +@pytest.mark.parametrize("freq", ["2BM", "1bm", "1BQ", "2BQ-MAR", "2bq=-mar"]) +def test_resample_BM_BQ_raises(freq): + msg = f"Invalid frequency: {freq}" s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample(freq).mean() - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = s.resample(freq_depr).mean() - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() def test_resample_ms_closed_right(unit): diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 67db427a2cdb7..a4e27ad46c59c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -988,30 +988,22 @@ def test_resample_t_l_deprecated(self): ser.resample("T").mean() @pytest.mark.parametrize( - "freq, freq_depr, freq_res, freq_depr_res, data", + "freq, freq_depr, freq_depr_res", [ - ("2Q", "2q", "2Y", "2y", [0.5]), - ("2M", "2m", "2Q", "2q", [1.0, 3.0]), + ("2Q", "2q", "2y"), + ("2M", "2m", "2q"), ], ) - def test_resample_lowercase_frequency_deprecated( - self, freq, freq_depr, freq_res, freq_depr_res, data - ): - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq[1:]}' instead." - depr_msg_res = f"'{freq_depr_res[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq_res[1:]}' instead." 
- - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - rng_l = period_range("2020-01-01", "2020-08-01", freq=freq_depr) - ser = Series(np.arange(len(rng_l)), index=rng_l) - - rng = period_range("2020-01-01", "2020-08-01", freq=freq_res) - expected = Series(data=data, index=rng) + def test_resample_lowercase_frequency_raises(self, freq, freq_depr, freq_depr_res): + msg = f"Invalid frequency: {freq_depr}" + with pytest.raises(ValueError, match=msg): + period_range("2020-01-01", "2020-08-01", freq=freq_depr) - with tm.assert_produces_warning(FutureWarning, match=depr_msg_res): - result = ser.resample(freq_depr_res).mean() - tm.assert_series_equal(result, expected) + msg = f"Invalid frequency: {freq_depr_res}" + rng = period_range("2020-01-01", "2020-08-01", freq=freq) + ser = Series(np.arange(len(rng)), index=rng) + with pytest.raises(ValueError, match=msg): + ser.resample(freq_depr_res).mean() @pytest.mark.parametrize( "offset", @@ -1031,25 +1023,26 @@ def test_asfreq_invalid_period_offset(self, offset, frame_or_series): @pytest.mark.parametrize( - "freq,freq_depr", + "freq", [ - ("2M", "2ME"), - ("2Q", "2QE"), - ("2Q-FEB", "2QE-FEB"), - ("2Y", "2YE"), - ("2Y-MAR", "2YE-MAR"), - ("2M", "2me"), - ("2Q", "2qe"), - ("2Y-MAR", "2ye-mar"), + ("2ME"), + ("2QE"), + ("2QE-FEB"), + ("2YE"), + ("2YE-MAR"), + ("2me"), + ("2qe"), + ("2ye-mar"), ], ) -def test_resample_frequency_ME_QE_YE_error_message(frame_or_series, freq, freq_depr): +def test_resample_frequency_ME_QE_YE_raises(frame_or_series, freq): # GH#9586 - msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + msg = f"{freq[1:]} is not supported as period frequency" obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) + msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): - obj.resample(freq_depr) + obj.resample(freq) def test_corner_cases_period(simple_period_range_series): @@ -1062,20 +1055,11 @@ def test_corner_cases_period(simple_period_range_series): assert len(result) == 0 -@pytest.mark.parametrize( - "freq_depr", - [ - "2BME", - "2CBME", - "2SME", - "2BQE-FEB", - "2BYE-MAR", - ], -) -def test_resample_frequency_invalid_freq(frame_or_series, freq_depr): +@pytest.mark.parametrize("freq", ["2BME", "2CBME", "2SME", "2BQE-FEB", "2BYE-MAR"]) +def test_resample_frequency_invalid_freq(frame_or_series, freq): # GH#9586 - msg = f"Invalid frequency: {freq_depr[1:]}" + msg = f"Invalid frequency: {freq}" obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) with pytest.raises(ValueError, match=msg): - obj.resample(freq_depr) + obj.resample(freq) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 1a21d234f1d50..90d4a7d0cc23b 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -59,6 +59,7 @@ def test_asfreq_corner(self): def test_conv_annual(self): # frequency conversion tests: from Annual Frequency + msg = INVALID_FREQ_ERR_MSG ival_A = Period(freq="Y", year=2007) @@ -110,18 +111,17 @@ def test_conv_annual(self): assert ival_A.asfreq("B", "E") == ival_A_to_B_end assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - msg = "'H' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): + msg_depr = "'H' is deprecated and will be removed in a future version." 
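Likewise for resampling period data, the offset-style aliases are rejected outright; a quick sketch:

    import pandas as pd

    s = pd.Series(range(5), index=pd.period_range("2020-01-01", periods=5))
    try:
        s.resample("2ME")  # offset alias, not valid for Period data
    except ValueError as err:
        print(err)         # e.g. "Invalid frequency: 2ME"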
+ with tm.assert_produces_warning(FutureWarning, match=msg_depr): assert ival_A.asfreq("H", "s") == ival_A_to_H_start assert ival_A.asfreq("H", "E") == ival_A_to_H_end assert ival_A.asfreq("min", "s") == ival_A_to_T_start assert ival_A.asfreq("min", "E") == ival_A_to_T_end - msg = "Invalid frequency: T" with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("T", "s") == ival_A_to_T_start assert ival_A.asfreq("T", "E") == ival_A_to_T_end - msg = "'S' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): + msg_depr = "'S' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg_depr): assert ival_A.asfreq("S", "S") == ival_A_to_S_start assert ival_A.asfreq("S", "E") == ival_A_to_S_end @@ -820,7 +820,7 @@ def test_asfreq_MS(self): assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") - msg = "MS is not supported as period frequency" + msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 2c3a0816737fc..49bd48b40e67a 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -60,7 +60,7 @@ def test_invalid_frequency_error_message(self): Period("2012-01-02", freq="WOM-1MON") def test_invalid_frequency_period_error_message(self): - msg = "for Period, please use 'M' instead of 'ME'" + msg = "Invalid frequency: ME" with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="ME") diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index ad4e9e2bcf38a..07bdfca8f2f2d 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -176,6 +176,14 @@ def test_anchored_shortcuts(shortcut, expected): assert result == expected +def test_to_offset_lowercase_frequency_w_deprecated(): + # GH#54939 + msg = "'w' is deprecated and will be removed in a future version" + + with tm.assert_produces_warning(FutureWarning, match=msg): + to_offset("2w") + + @pytest.mark.parametrize( "freq_depr", [ @@ -185,18 +193,16 @@ def test_anchored_shortcuts(shortcut, expected): "2qs-feb", "2bqs", "2sms", + "1sme", "2bms", "2cbme", "2me", - "2w", ], ) -def test_to_offset_lowercase_frequency_deprecated(freq_depr): - # GH#54939 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version, please use '{freq_depr.upper()[1:]}' instead." 
+def test_to_offset_lowercase_frequency_raises(freq_depr):
+    msg = f"Invalid frequency: {freq_depr}"
 
-    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
+    with pytest.raises(ValueError, match=msg):
         to_offset(freq_depr)
 

From a30bb6f571a843853f679516d3b4ff664bd3d04b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 7 Jun 2024 07:39:18 -1000
Subject: [PATCH 066/272] REF/CLN: Standardized matplotlib imports (#58937)

* Use standard import matplotlib as mpl

* Standardize more matplotlib imports

* Fix matplotlib units

* Reduce diff a little more

* Import matplotlib dates

* satisfy pyright
---
 pandas/plotting/_core.py                      |  2 +-
 pandas/plotting/_matplotlib/__init__.py       |  2 +-
 pandas/plotting/_matplotlib/boxplot.py        | 12 +++---
 pandas/plotting/_matplotlib/converter.py      | 19 ++++-----
 pandas/plotting/_matplotlib/core.py           | 39 ++++++-------------
 pandas/plotting/_matplotlib/misc.py           |  9 ++---
 pandas/plotting/_matplotlib/style.py          |  4 +-
 pandas/plotting/_matplotlib/timeseries.py     |  4 +-
 pandas/plotting/_matplotlib/tools.py          | 23 +++++------
 .../tests/io/formats/style/test_matplotlib.py |  4 +-
 pandas/tests/plotting/frame/test_frame.py     | 33 +++-------------
 .../tests/plotting/frame/test_frame_color.py  | 32 ++++++++-------
 .../tests/plotting/frame/test_frame_legend.py |  7 +---
 pandas/tests/plotting/test_converter.py       |  8 ++--
 14 files changed, 73 insertions(+), 125 deletions(-)

diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index c83985917591c..0daf3cfafe81c 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -1598,7 +1598,7 @@ def area(
 
         See Also
         --------
-        DataFrame.plot : Make plots of DataFrame using matplotlib / pylab.
+        DataFrame.plot : Make plots of DataFrame using matplotlib.
Examples -------- diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 75c61da03795a..87f3ca09ad346 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -69,7 +69,7 @@ def plot(data, kind, **kwargs): kwargs["ax"] = getattr(ax, "left_ax", ax) plot_obj = PLOT_CLASSES[kind](data, **kwargs) plot_obj.generate() - plot_obj.draw() + plt.draw_if_interactive() return plot_obj.result diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 2a28cd94b64e5..11c0ba01fff64 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -7,7 +7,7 @@ ) import warnings -from matplotlib.artist import setp +import matplotlib as mpl import numpy as np from pandas._libs import lib @@ -274,13 +274,13 @@ def maybe_color_bp(bp, color_tup, **kwds) -> None: # GH#30346, when users specifying those arguments explicitly, our defaults # for these four kwargs should be overridden; if not, use Pandas settings if not kwds.get("boxprops"): - setp(bp["boxes"], color=color_tup[0], alpha=1) + mpl.artist.setp(bp["boxes"], color=color_tup[0], alpha=1) if not kwds.get("whiskerprops"): - setp(bp["whiskers"], color=color_tup[1], alpha=1) + mpl.artist.setp(bp["whiskers"], color=color_tup[1], alpha=1) if not kwds.get("medianprops"): - setp(bp["medians"], color=color_tup[2], alpha=1) + mpl.artist.setp(bp["medians"], color=color_tup[2], alpha=1) if not kwds.get("capprops"): - setp(bp["caps"], color=color_tup[3], alpha=1) + mpl.artist.setp(bp["caps"], color=color_tup[3], alpha=1) def _grouped_plot_by_column( @@ -455,7 +455,7 @@ def plot_group(keys, values, ax: Axes, **kwds): if ax is None: rc = {"figure.figsize": figsize} if figsize is not None else {} - with plt.rc_context(rc): + with mpl.rc_context(rc): ax = plt.gca() data = data._get_numeric_data() naxes = len(data.columns) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 50fa722f6dd72..a8f08769ceae2 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -14,13 +14,8 @@ ) import warnings +import matplotlib as mpl import matplotlib.dates as mdates -from matplotlib.ticker import ( - AutoLocator, - Formatter, - Locator, -) -from matplotlib.transforms import nonsingular import matplotlib.units as munits import numpy as np @@ -174,7 +169,7 @@ def axisinfo(unit, axis) -> munits.AxisInfo | None: if unit != "time": return None - majloc = AutoLocator() + majloc = mpl.ticker.AutoLocator() # pyright: ignore[reportAttributeAccessIssue] majfmt = TimeFormatter(majloc) return munits.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") @@ -184,7 +179,7 @@ def default_units(x, axis) -> str: # time formatter -class TimeFormatter(Formatter): +class TimeFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] def __init__(self, locs) -> None: self.locs = locs @@ -917,7 +912,7 @@ def get_finder(freq: BaseOffset): raise NotImplementedError(f"Unsupported frequency: {dtype_code}") -class TimeSeries_DateLocator(Locator): +class TimeSeries_DateLocator(mpl.ticker.Locator): # pyright: ignore[reportAttributeAccessIssue] """ Locates the ticks along an axis controlled by a :class:`Series`. 
@@ -998,7 +993,7 @@ def autoscale(self): if vmin == vmax: vmin -= 1 vmax += 1 - return nonsingular(vmin, vmax) + return mpl.transforms.nonsingular(vmin, vmax) # ------------------------------------------------------------------------- @@ -1006,7 +1001,7 @@ def autoscale(self): # ------------------------------------------------------------------------- -class TimeSeries_DateFormatter(Formatter): +class TimeSeries_DateFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] """ Formats the ticks along an axis controlled by a :class:`PeriodIndex`. @@ -1082,7 +1077,7 @@ def __call__(self, x, pos: int | None = 0) -> str: return period.strftime(fmt) -class TimeSeries_TimedeltaFormatter(Formatter): +class TimeSeries_TimedeltaFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] """ Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. """ diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index fffeb9b82492f..2d3c81f2512aa 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -107,9 +107,7 @@ def _color_in_style(style: str) -> bool: """ Check if there is a color letter in the style string. """ - from matplotlib.colors import BASE_COLORS - - return not set(BASE_COLORS).isdisjoint(style) + return not set(mpl.colors.BASE_COLORS).isdisjoint(style) class MPLPlot(ABC): @@ -176,8 +174,6 @@ def __init__( style=None, **kwds, ) -> None: - import matplotlib.pyplot as plt - # if users assign an empty list or tuple, raise `ValueError` # similar to current `df.box` and `df.hist` APIs. if by in ([], ()): @@ -238,7 +234,7 @@ def __init__( self.rot = self._default_rot if grid is None: - grid = False if secondary_y else plt.rcParams["axes.grid"] + grid = False if secondary_y else mpl.rcParams["axes.grid"] self.grid = grid self.legend = legend @@ -497,10 +493,6 @@ def _get_nseries(self, data: Series | DataFrame) -> int: def nseries(self) -> int: return self._get_nseries(self.data) - @final - def draw(self) -> None: - self.plt.draw_if_interactive() - @final def generate(self) -> None: self._compute_plot_data() @@ -570,6 +562,8 @@ def axes(self) -> Sequence[Axes]: @final @cache_readonly def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: + import matplotlib.pyplot as plt + if self.subplots: naxes = ( self.nseries if isinstance(self.subplots, bool) else len(self.subplots) @@ -584,7 +578,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: layout_type=self._layout_type, ) elif self.ax is None: - fig = self.plt.figure(figsize=self.figsize) + fig = plt.figure(figsize=self.figsize) axes = fig.add_subplot(111) else: fig = self.ax.get_figure() @@ -918,13 +912,6 @@ def _get_ax_legend(ax: Axes): ax = other_ax return ax, leg - @final - @cache_readonly - def plt(self): - import matplotlib.pyplot as plt - - return plt - _need_to_set_index = False @final @@ -1219,9 +1206,9 @@ def _get_errorbars( @final def _get_subplots(self, fig: Figure) -> list[Axes]: if Version(mpl.__version__) < Version("3.8"): - from matplotlib.axes import Subplot as Klass + Klass = mpl.axes.Subplot else: - from matplotlib.axes import Axes as Klass + Klass = mpl.axes.Axes return [ ax @@ -1386,7 +1373,7 @@ def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool): if c is not None and color is not None: raise TypeError("Specify exactly one of `c` and `color`") if c is None and color is None: - c_values = self.plt.rcParams["patch.facecolor"] + c_values = mpl.rcParams["patch.facecolor"] 
elif color is not None: c_values = color elif color_by_categorical: @@ -1411,12 +1398,10 @@ def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): cmap = None if color_by_categorical and cmap is not None: - from matplotlib import colors - n_cats = len(self.data[c].cat.categories) - cmap = colors.ListedColormap([cmap(i) for i in range(cmap.N)]) + cmap = mpl.colors.ListedColormap([cmap(i) for i in range(cmap.N)]) bounds = np.linspace(0, n_cats, n_cats + 1) - norm = colors.BoundaryNorm(bounds, cmap.N) + norm = mpl.colors.BoundaryNorm(bounds, cmap.N) # TODO: warn that we are ignoring self.norm if user specified it? # Doesn't happen in any tests 2023-11-09 else: @@ -1676,8 +1661,6 @@ def _update_stacker(cls, ax: Axes, stacking_id: int | None, values) -> None: ax._stacker_neg_prior[stacking_id] += values # type: ignore[attr-defined] def _post_plot_logic(self, ax: Axes, data) -> None: - from matplotlib.ticker import FixedLocator - def get_label(i): if is_float(i) and i.is_integer(): i = int(i) @@ -1691,7 +1674,7 @@ def get_label(i): xticklabels = [get_label(x) for x in xticks] # error: Argument 1 to "FixedLocator" has incompatible type "ndarray[Any, # Any]"; expected "Sequence[float]" - ax.xaxis.set_major_locator(FixedLocator(xticks)) # type: ignore[arg-type] + ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(xticks)) # type: ignore[arg-type] ax.set_xticklabels(xticklabels) # If the index is an irregular time series, then by default diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 1f9212587e05e..4a891ec27e8cb 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -3,8 +3,7 @@ import random from typing import TYPE_CHECKING -from matplotlib import patches -import matplotlib.lines as mlines +import matplotlib as mpl import numpy as np from pandas.core.dtypes.missing import notna @@ -129,7 +128,7 @@ def scatter_matrix( def _get_marker_compat(marker): - if marker not in mlines.lineMarkers: + if marker not in mpl.lines.lineMarkers: return "o" return marker @@ -190,10 +189,10 @@ def normalize(series): ) ax.legend() - ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) + ax.add_patch(mpl.patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) for xy, name in zip(s, df.columns): - ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) + ax.add_patch(mpl.patches.Circle(xy, radius=0.025, facecolor="gray")) if xy[0] < 0.0 and xy[1] < 0.0: ax.text( diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index d725d53bd21ec..962f9711d9916 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -260,9 +260,7 @@ def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color] def _get_default_colors(num_colors: int) -> list[Color]: """Get `num_colors` of default colors from matplotlib rc params.""" - import matplotlib.pyplot as plt - - colors = [c["color"] for c in plt.rcParams["axes.prop_cycle"]] + colors = [c["color"] for c in mpl.rcParams["axes.prop_cycle"]] return colors[0:num_colors] diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index d438f521c0dbc..d95ccad2da565 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -333,7 +333,7 @@ def format_dateaxis( default, changing the limits of the x axis will intelligently change the positions of the ticks. 
""" - from matplotlib import pylab + import matplotlib.pyplot as plt # handle index specific formatting # Note: DatetimeIndex does not use this @@ -365,4 +365,4 @@ def format_dateaxis( else: raise TypeError("index type not supported") - pylab.draw_if_interactive() + plt.draw_if_interactive() diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 50cfdbd967ea7..ae82f0232aee0 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -5,8 +5,7 @@ from typing import TYPE_CHECKING import warnings -from matplotlib import ticker -import matplotlib.table +import matplotlib as mpl import numpy as np from pandas.util._exceptions import find_stack_level @@ -77,7 +76,7 @@ def table( # error: Argument "cellText" to "table" has incompatible type "ndarray[Any, # Any]"; expected "Sequence[Sequence[str]] | None" - return matplotlib.table.table( + return mpl.table.table( ax, cellText=cellText, # type: ignore[arg-type] rowLabels=rowLabels, @@ -327,10 +326,10 @@ def _remove_labels_from_axis(axis: Axis) -> None: # set_visible will not be effective if # minor axis has NullLocator and NullFormatter (default) - if isinstance(axis.get_minor_locator(), ticker.NullLocator): - axis.set_minor_locator(ticker.AutoLocator()) - if isinstance(axis.get_minor_formatter(), ticker.NullFormatter): - axis.set_minor_formatter(ticker.FormatStrFormatter("")) + if isinstance(axis.get_minor_locator(), mpl.ticker.NullLocator): + axis.set_minor_locator(mpl.ticker.AutoLocator()) + if isinstance(axis.get_minor_formatter(), mpl.ticker.NullFormatter): + axis.set_minor_formatter(mpl.ticker.FormatStrFormatter("")) for t in axis.get_minorticklabels(): t.set_visible(False) @@ -455,17 +454,15 @@ def set_ticks_props( ylabelsize: int | None = None, yrot=None, ): - import matplotlib.pyplot as plt - for ax in flatten_axes(axes): if xlabelsize is not None: - plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize) if xrot is not None: - plt.setp(ax.get_xticklabels(), rotation=xrot) + mpl.artist.setp(ax.get_xticklabels(), rotation=xrot) if ylabelsize is not None: - plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize) if yrot is not None: - plt.setp(ax.get_yticklabels(), rotation=yrot) + mpl.artist.setp(ax.get_yticklabels(), rotation=yrot) return axes diff --git a/pandas/tests/io/formats/style/test_matplotlib.py b/pandas/tests/io/formats/style/test_matplotlib.py index 70ddd65c02d14..296fb20d855c4 100644 --- a/pandas/tests/io/formats/style/test_matplotlib.py +++ b/pandas/tests/io/formats/style/test_matplotlib.py @@ -7,11 +7,9 @@ Series, ) -pytest.importorskip("matplotlib") +mpl = pytest.importorskip("matplotlib") pytest.importorskip("jinja2") -import matplotlib as mpl - from pandas.io.formats.style import Styler pytestmark = pytest.mark.usefixtures("mpl_cleanup") diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index adb56a40b0071..e809bd33610f1 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1177,20 +1177,16 @@ def test_hist_df_series(self): _check_ticks_props(axes, xrot=40, yrot=0) def test_hist_df_series_cumulative_density(self): - from matplotlib.patches import Rectangle - series = Series(np.random.default_rng(2).random(10)) ax = series.plot.hist(cumulative=True, bins=4, density=True) # height of last bin (index 5) must be 1.0 - rects = [x for x in 
ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-1].get_height(), 1.0) def test_hist_df_series_cumulative(self): - from matplotlib.patches import Rectangle - series = Series(np.random.default_rng(2).random(10)) ax = series.plot.hist(cumulative=True, bins=4) - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-2].get_height(), 10.0) @@ -1385,8 +1381,6 @@ def test_plot_int_columns(self): ], ) def test_style_by_column(self, markers): - import matplotlib.pyplot as plt - fig = plt.gcf() fig.clf() fig.add_subplot(111) @@ -1969,9 +1963,6 @@ def test_sharex_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - - plt.close("all") gs, axes = _generate_4_axes_via_gridspec() df = DataFrame( @@ -2009,8 +2000,6 @@ def test_sharex_false_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - df = DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -2035,8 +2024,6 @@ def test_sharey_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - gs, axes = _generate_4_axes_via_gridspec() df = DataFrame( @@ -2073,8 +2060,6 @@ def _check(axes): def test_sharey_and_ax_tight(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, - import matplotlib.pyplot as plt - df = DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -2134,9 +2119,6 @@ def test_memory_leak(self, kind): def test_df_gridspec_patterns_vert_horiz(self): # GH 10819 - from matplotlib import gridspec - import matplotlib.pyplot as plt - ts = Series( np.random.default_rng(2).standard_normal(10), index=date_range("1/1/2000", periods=10), @@ -2149,14 +2131,14 @@ def test_df_gridspec_patterns_vert_horiz(self): ) def _get_vertical_grid(): - gs = gridspec.GridSpec(3, 1) + gs = mpl.gridspec.GridSpec(3, 1) fig = plt.figure() ax1 = fig.add_subplot(gs[:2, :]) ax2 = fig.add_subplot(gs[2, :]) return ax1, ax2 def _get_horizontal_grid(): - gs = gridspec.GridSpec(1, 3) + gs = mpl.gridspec.GridSpec(1, 3) fig = plt.figure() ax1 = fig.add_subplot(gs[:, :2]) ax2 = fig.add_subplot(gs[:, 2]) @@ -2217,9 +2199,6 @@ def _get_horizontal_grid(): def test_df_gridspec_patterns_boxed(self): # GH 10819 - from matplotlib import gridspec - import matplotlib.pyplot as plt - ts = Series( np.random.default_rng(2).standard_normal(10), index=date_range("1/1/2000", periods=10), @@ -2227,7 +2206,7 @@ def test_df_gridspec_patterns_boxed(self): # boxed def _get_boxed_grid(): - gs = gridspec.GridSpec(3, 3) + gs = mpl.gridspec.GridSpec(3, 3) fig = plt.figure() ax1 = fig.add_subplot(gs[:2, :2]) ax2 = fig.add_subplot(gs[:2, 2]) @@ -2595,8 +2574,6 @@ def test_plot_period_index_makes_no_right_shift(self, freq): def _generate_4_axes_via_gridspec(): - import matplotlib.pyplot as plt - gs = mpl.gridspec.GridSpec(2, 2) ax_tl = plt.subplot(gs[0, 0]) ax_ll = plt.subplot(gs[1, 0]) diff --git 
a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py index 76d3b20aaa2c6..4b35e896e1a6c 100644 --- a/pandas/tests/plotting/frame/test_frame_color.py +++ b/pandas/tests/plotting/frame/test_frame_color.py @@ -364,14 +364,16 @@ def test_line_colors_and_styles_subplots_list_styles(self): _check_colors(ax.get_lines(), linecolors=[c]) def test_area_colors(self): - from matplotlib.collections import PolyCollection - custom_colors = "rgcby" df = DataFrame(np.random.default_rng(2).random((5, 5))) ax = df.plot.area(color=custom_colors) _check_colors(ax.get_lines(), linecolors=custom_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] _check_colors(poly, facecolors=custom_colors) handles, _ = ax.get_legend_handles_labels() @@ -381,14 +383,15 @@ def test_area_colors(self): assert h.get_alpha() is None def test_area_colors_poly(self): - from matplotlib import cm - from matplotlib.collections import PolyCollection - df = DataFrame(np.random.default_rng(2).random((5, 5))) ax = df.plot.area(colormap="jet") - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + jet_colors = [mpl.cm.jet(n) for n in np.linspace(0, 1, len(df))] _check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] _check_colors(poly, facecolors=jet_colors) handles, _ = ax.get_legend_handles_labels() @@ -397,15 +400,16 @@ def test_area_colors_poly(self): assert h.get_alpha() is None def test_area_colors_stacked_false(self): - from matplotlib import cm - from matplotlib.collections import PolyCollection - df = DataFrame(np.random.default_rng(2).random((5, 5))) - jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + jet_colors = [mpl.cm.jet(n) for n in np.linspace(0, 1, len(df))] # When stacked=False, alpha is set to 0.5 - ax = df.plot.area(colormap=cm.jet, stacked=False) + ax = df.plot.area(colormap=mpl.cm.jet, stacked=False) _check_colors(ax.get_lines(), linecolors=jet_colors) - poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + poly = [ + o + for o in ax.get_children() + if isinstance(o, mpl.collections.PolyCollection) + ] jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors] _check_colors(poly, facecolors=jet_with_alpha) diff --git a/pandas/tests/plotting/frame/test_frame_legend.py b/pandas/tests/plotting/frame/test_frame_legend.py index 402a4b9531e5d..a9723fe4ef871 100644 --- a/pandas/tests/plotting/frame/test_frame_legend.py +++ b/pandas/tests/plotting/frame/test_frame_legend.py @@ -26,9 +26,6 @@ class TestFrameLegend: ) def test_mixed_yerr(self): # https://github.com/pandas-dev/pandas/issues/39522 - from matplotlib.collections import LineCollection - from matplotlib.lines import Line2D - df = DataFrame([{"x": 1, "a": 1, "b": 1}, {"x": 2, "a": 2, "b": 3}]) ax = df.plot("x", "a", c="orange", yerr=0.1, label="orange") @@ -40,8 +37,8 @@ def test_mixed_yerr(self): else: result_handles = legend.legend_handles - assert isinstance(result_handles[0], LineCollection) - assert isinstance(result_handles[1], Line2D) + assert isinstance(result_handles[0], mpl.collections.LineCollection) + assert isinstance(result_handles[1], mpl.lines.Line2D) def test_legend_false(self): # https://github.com/pandas-dev/pandas/issues/40044 diff --git 
a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 6a1777b098de0..cfdfa7f723599 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -196,7 +196,7 @@ def test_conversion_float(self, dtc): rtol = 0.5 * 10**-9 rs = dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None) - xp = converter.mdates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) + xp = dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) tm.assert_almost_equal(rs, xp, rtol=rtol) rs = dtc.convert( @@ -217,10 +217,10 @@ def test_conversion_float(self, dtc): def test_conversion_outofbounds_datetime(self, dtc, values): # 2579 rs = dtc.convert(values, None, None) - xp = converter.mdates.date2num(values) + xp = dates.date2num(values) tm.assert_numpy_array_equal(rs, xp) rs = dtc.convert(values[0], None, None) - xp = converter.mdates.date2num(values[0]) + xp = dates.date2num(values[0]) assert rs == xp @pytest.mark.parametrize( @@ -243,7 +243,7 @@ def test_dateindex_conversion(self, freq, dtc): rtol = 10**-9 dateindex = date_range("2020-01-01", periods=10, freq=freq) rs = dtc.convert(dateindex, None, None) - xp = converter.mdates.date2num(dateindex._mpl_repr()) + xp = dates.date2num(dateindex._mpl_repr()) tm.assert_almost_equal(rs, xp, rtol=rtol) @pytest.mark.parametrize("offset", [Second(), Milli(), Micro(50)]) From c0262e8cf7fea4dd6853a3ab2bfc7d891ab36fe8 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 8 Jun 2024 01:58:44 +0530 Subject: [PATCH 067/272] DOC: fix PR07,SA01 for pandas.qcut (#58957) --- ci/code_checks.sh | 1 - pandas/core/reshape/tile.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ade7173cf0344..7bf676d5628aa 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -471,7 +471,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ - -i "pandas.qcut PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 1499afbde56d3..d780433386395 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -289,6 +289,7 @@ def qcut( Parameters ---------- x : 1d ndarray or Series + Input Numpy array or pandas Series object to be discretized. q : int or list-like of float Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. @@ -313,6 +314,11 @@ def qcut( bins : ndarray of floats Returned only if `retbins` is True. + See Also + -------- + cut : Bin values into discrete intervals. + Series.quantile : Return value at the given quantile. + Notes ----- Out of bounds values will be NA in the resulting Categorical object From f2f298b95c6fa1881803ad3f6976c1b166ac354f Mon Sep 17 00:00:00 2001 From: SubsequentlySneeds <118424338+SubsequentlySneeds@users.noreply.github.com> Date: Fri, 7 Jun 2024 15:17:12 -0700 Subject: [PATCH 068/272] Fix link wording in "10 Minutes to Pandas" (#58961) Fix link wording Current wording is grammatically incorrect and does not match the target page's title - updated to match. 
---
 doc/source/user_guide/10min.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
index 3cdcb81c14961..887ffd5580a52 100644
--- a/doc/source/user_guide/10min.rst
+++ b/doc/source/user_guide/10min.rst
@@ -101,7 +101,7 @@ truncated for brevity.
 Viewing data
 ------------
 
-See the :ref:`Essentially basics functionality section <basics>`.
+See the :ref:`Essential basic functionality section <basics>`.
 
 Use :meth:`DataFrame.head` and :meth:`DataFrame.tail` to view the top and bottom
 rows of the frame respectively:

From 81a44faf5c188546cb8e949b233135ac9855df1f Mon Sep 17 00:00:00 2001
From: Matt Heeter <94481579+mattheeter@users.noreply.github.com>
Date: Sat, 8 Jun 2024 10:33:54 -0400
Subject: [PATCH 069/272] BUG: DateTimeIndex.is_year_start unexpected behavior
 when constructed with freq 'MS' date_range (#57377) (#57494)

* Added some comments to where the bug is occurring

* Potential fix, passed all potentially relevant tests

* Very likely fix

* Reverted to previous start/end scheme; added tests

* Added fixes to whatsnew doc

* Removed stray comment

* Fixed alphabetical problem in whatsnew

* add parametric test

* fixup

---------

Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
---
 doc/source/whatsnew/v3.0.0.rst                |  1 +
 pandas/_libs/tslibs/fields.pyx                |  3 +-
 pandas/core/arrays/datetimes.py               |  2 +-
 .../indexes/datetimes/test_scalar_compat.py   | 95 +++++++++++++++++++
 4 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 2b45c8aa4865f..27b16cb706e8d 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -481,6 +481,7 @@ Categorical
 
 Datetimelike
 ^^^^^^^^^^^^
+- Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`)
 - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`)
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx
index 399a5c2e96cd5..e523ac2e7b5c6 100644
--- a/pandas/_libs/tslibs/fields.pyx
+++ b/pandas/_libs/tslibs/fields.pyx
@@ -253,9 +253,10 @@ def get_start_end_field(
     # month of year. Other offsets use month, startingMonth as ending
     # month of year.
- if freq_name.lstrip("B")[0:2] in ["MS", "QS", "YS"]: + if freq_name.lstrip("B")[0:2] in ["QS", "YS"]: end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw + else: end_month = month_kw start_month = (end_month % 12) + 1 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index bbbf7a9b4a63a..077bde35a4c94 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -143,7 +143,7 @@ def f(self): month_kw = 12 if freq: kwds = freq.kwds - month_kw = kwds.get("startingMonth", kwds.get("month", 12)) + month_kw = kwds.get("startingMonth", kwds.get("month", month_kw)) if freq is not None: freq_name = freq.name diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 5831846c9ceb6..87251f35c755b 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -11,6 +11,8 @@ import locale import unicodedata +from hypothesis import given +import hypothesis.strategies as st import numpy as np import pytest @@ -329,6 +331,84 @@ def test_dti_is_month_start_custom(self): with pytest.raises(ValueError, match=msg): dti.is_month_start + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([False, True, False])), + ("2017-12-01", "QS", 3, np.array([True, False, False])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_year_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_year_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, False, False])), + ("2017-12-01", "QE", 3, np.array([True, False, False])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_year_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_year_end + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([False, True, False])), + ("2017-12-01", "QS", 3, np.array([True, True, True])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_quarter_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_quarter_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, False, False])), + ("2017-12-01", "QE", 3, np.array([True, True, True])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_quarter_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_quarter_end + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([True, True, True])), + ("2017-12-01", "QS", 3, np.array([True, True, True])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_month_start(self, timestamp, freq, periods, expected_values): + # GH57377 + 
result = date_range(timestamp, freq=freq, periods=periods).is_month_start
+        tm.assert_numpy_array_equal(result, expected_values)
+
+    @pytest.mark.parametrize(
+        "timestamp, freq, periods, expected_values",
+        [
+            ("2017-12-01", "ME", 3, np.array([True, True, True])),
+            ("2017-12-01", "QE", 3, np.array([True, True, True])),
+            ("2017-12-01", "YE", 3, np.array([True, True, True])),
+        ],
+    )
+    def test_dti_dr_is_month_end(self, timestamp, freq, periods, expected_values):
+        # GH57377
+        result = date_range(timestamp, freq=freq, periods=periods).is_month_end
+        tm.assert_numpy_array_equal(result, expected_values)
+
     def test_dti_is_year_quarter_start_doubledigit_freq(self):
         # GH#58523
         dr = date_range("2017-01-01", periods=2, freq="10YS")
@@ -343,3 +423,18 @@ def test_dti_is_year_start_freq_custom_business_day_with_digit(self):
         msg = "Custom business days is not supported by is_year_start"
         with pytest.raises(ValueError, match=msg):
             dr.is_year_start
+
+
+@given(
+    dt=st.datetimes(min_value=datetime(1960, 1, 1), max_value=datetime(1980, 1, 1)),
+    n=st.integers(min_value=1, max_value=10),
+    freq=st.sampled_from(["MS", "QS", "YS"]),
+)
+@pytest.mark.slow
+def test_against_scalar_parametric(freq, dt, n):
+    # https://github.com/pandas-dev/pandas/issues/49606
+    freq = f"{n}{freq}"
+    d = date_range(dt, periods=3, freq=freq)
+    result = list(d.is_year_start)
+    expected = [x.is_year_start for x in d]
+    assert result == expected

From a787f4580db8477c03359a39700c1310bc680756 Mon Sep 17 00:00:00 2001
From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com>
Date: Sun, 9 Jun 2024 17:56:13 +0200
Subject: [PATCH 070/272] TST: add tests for
 DatetimeIndex.is_year_start/is_quarter_start on "BMS" frequency (#58691)

* bug-DatetimeIndex-is_year_start-breaks-on-freq-BusinessMonthStart

* correct def get_start_end_field

* fixup

* parametrize test, and a note to v3.0.0
---
 pandas/tests/indexes/datetimes/test_scalar_compat.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py
index 87251f35c755b..eb472b099fb1f 100644
--- a/pandas/tests/indexes/datetimes/test_scalar_compat.py
+++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py
@@ -424,6 +424,17 @@ def test_dti_is_year_start_freq_custom_business_day_with_digit(self):
         with pytest.raises(ValueError, match=msg):
             dr.is_year_start
 
+    @pytest.mark.parametrize("freq", ["3BMS", offsets.BusinessMonthBegin(3)])
+    def test_dti_is_year_quarter_start_freq_business_month_begin(self, freq):
+        # GH#58729
+        dr = date_range("2020-01-01", periods=5, freq=freq)
+        result = [x.is_year_start for x in dr]
+        assert result == [True, False, False, False, True]
+
+        dr = date_range("2020-01-01", periods=4, freq=freq)
+        result = [x.is_quarter_start for x in dr]
+        assert all(dr.is_quarter_start)
+

From 2d1e59dffb78e013418d3b306126f71b9b456972 Mon Sep 17 00:00:00 2001
From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com>
Date: Mon, 10 Jun 2024 12:44:42 +0100
Subject: [PATCH 071/272] BUG: Unable to open Stata 118 or 119 format files
 saved in big-endian… (#58640)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* BUG: Unable to open Stata 118 or 119 format files saved in big-endian format that contain strL data

* Rename test functions to make their purpose clearer
---
doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 7 ++- pandas/tests/io/data/stata/stata12_118.dta | Bin 0 -> 2622 bytes pandas/tests/io/data/stata/stata12_119.dta | Bin 0 -> 2632 bytes pandas/tests/io/data/stata/stata12_be_117.dta | Bin 0 -> 1285 bytes pandas/tests/io/data/stata/stata12_be_118.dta | Bin 0 -> 2622 bytes pandas/tests/io/data/stata/stata12_be_119.dta | Bin 0 -> 2632 bytes pandas/tests/io/data/stata/stata14_119.dta | Bin 0 -> 5574 bytes pandas/tests/io/data/stata/stata14_be_118.dta | Bin 0 -> 5556 bytes pandas/tests/io/data/stata/stata14_be_119.dta | Bin 0 -> 5574 bytes pandas/tests/io/data/stata/stata16_119.dta | Bin 0 -> 4628 bytes pandas/tests/io/data/stata/stata16_be_118.dta | Bin 0 -> 4614 bytes pandas/tests/io/data/stata/stata16_be_119.dta | Bin 0 -> 4628 bytes pandas/tests/io/test_stata.py | 47 +++++++++++++++--- 14 files changed, 43 insertions(+), 12 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata12_118.dta create mode 100644 pandas/tests/io/data/stata/stata12_119.dta create mode 100644 pandas/tests/io/data/stata/stata12_be_117.dta create mode 100644 pandas/tests/io/data/stata/stata12_be_118.dta create mode 100644 pandas/tests/io/data/stata/stata12_be_119.dta create mode 100644 pandas/tests/io/data/stata/stata14_119.dta create mode 100644 pandas/tests/io/data/stata/stata14_be_118.dta create mode 100644 pandas/tests/io/data/stata/stata14_be_119.dta create mode 100644 pandas/tests/io/data/stata/stata16_119.dta create mode 100644 pandas/tests/io/data/stata/stata16_be_118.dta create mode 100644 pandas/tests/io/data/stata/stata16_be_119.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 27b16cb706e8d..e621ab2a5b9c5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -546,6 +546,7 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) +- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4e7bd160a5a52..9c6cd2faeaa2f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1600,14 +1600,13 @@ def _read_strls(self) -> None: v_o = self._read_uint64() else: buf = self._path_or_buf.read(12) - # Only tested on little endian file on little endian machine. + # Only tested on little endian machine. 
v_size = 2 if self._format_version == 118 else 3 if self._byteorder == "<": buf = buf[0:v_size] + buf[4 : (12 - v_size)] else: - # This path may not be correct, impossible to test - buf = buf[0:v_size] + buf[(4 + v_size) :] - v_o = struct.unpack("Q", buf)[0] + buf = buf[4 - v_size : 4] + buf[(4 + v_size) :] + v_o = struct.unpack(f"{self._byteorder}Q", buf)[0] typ = self._read_uint8() length = self._read_uint32() va = self._path_or_buf.read(length) diff --git a/pandas/tests/io/data/stata/stata12_118.dta b/pandas/tests/io/data/stata/stata12_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..87c6d1f063150d6a279d0b5cee1fbb5237123bcc GIT binary patch literal 2622 zcmeHJJ5R$f5H?gG!3GioTbHglsf0w($y6Yfp{NX_e__NRV@|_$T(%nUOFXvuGj3cpxJiZh$x@|Xq*^JO+= zf_(pMBlH4ziLBXfsoEBKGTPSHXX`Uc}Pc%+K$XZ+2VFms)Ct5zuFEIW6TIk|#^d&*BR0)~f0V z)Dif<2rN_p_EJMQrksY7arNEAp&)?&P$E@isoVc@MuSlK+Ca8;L7h$*(41nc7xZy% zHq%gCx&h9GK}enmayhuxk;NZIZ)f3U)6=qK`~{Ddy~qFn literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata12_119.dta b/pandas/tests/io/data/stata/stata12_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..fa63f0135738e6fb9be5b09d0da4ffde5bbb2ec9 GIT binary patch literal 2632 zcmeHJy-ve05H=u?V1t!`E?sd`i6Q7@styIIP=-E0cXvMDo&S;#l;)ZTJ=Kr;oxsTN~t9PYa< zpt%Zdc0dRL+Gfy6MbuF3Fi3bP5@r-DfsRK4Gma+g)blGX5mxFo@2XPsuj@6S`6$q= zm=T{~0NXwtp6^+q>F{^o>I;WIrXFmALyHf`7AFqB9A2DQJ2Qbj@tso;WUxz1wKT_0!@~+sq$D7m%KgJq=J}sgqdg<@L#m1G*f7{#cH< zR;=n@)&`lwHdyu5HtxCa&J%%VMdQ^=UPO?-t-!82k6%g!?SQ$1>&KFROVYzq8+Wpx zTFbQM4wO5v+YYQS0r#jO8gj`aEhPSS;z%)|8ENY1&ya^Gc(m Z!`$#C6CDYgO|trjG0R!`vKciekN+1`zAFF# literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata12_be_117.dta b/pandas/tests/io/data/stata/stata12_be_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..7f84d15fb76d0543eaecaa2af670198da64e495d GIT binary patch literal 1285 zcmdT^y-ve05O$lEfenP1yL81#C4>SeQ!$jGq6~e2$VuEbQbLPu3PB7NZ^3i0@f17& z?}3=Wj?*S8sv^{hNV)I+eCIo#4MI{$y--pD6GlVE32+uODj0E`6TnLW{8X|j*A7?1 z9^gWRK0y}XK7j#2NV5Y1G4)wY5MaZAj1s0&B{Q<;xUC-ZTOHduZrSc>#|B(j0WWuv z(3uWb{$7kwJIL`Y$JZR+4c6xdEd!`=XqbkZ<5P|wIer;zHUaCbFsD4Ce#|_X&X^$I zpJ->HN?B8Dw;QU;A`fQ$W}0p`8OvjpLzdlS12N??j)Yvb2k=S-qbX0Q6r}AQ+2g9H zWqaEdWwDz7^8&!j&8gPKcg)LX!!uP?=iAyt_i#OJan{*p#Hy!TXZt?^N|T*80fGtT zG?0v|F^d900RPg2>fIpc3xud2m|FFHt982w1wJh_^@}$nWK{d87b05>Lh@LU%i*QqsF~{ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata12_be_118.dta b/pandas/tests/io/data/stata/stata12_be_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..9ed6f39b0f9b53c49b9af3735ec2cd59b9a6d1c2 GIT binary patch literal 2622 zcmeHJJ5R$f5Oy0CiEbb+nXAz@<5!*fjoD<+BlFhiDa53rw zE=U*)MJri0X8F$Va$}jcusa~r`2a(tJ|>;TOH@P+W}li0q3E_ zbe`MHOrsYwsO@LiGg$w~@YA66X0UN(aQiI7afYu3n|m2{GbDhuSGc1*q+Y~anar6W z-(P6!Nu{#F*3+)3N{c)g{muQ8&AB$7A2%I9x;kbxK+L#|LLnF12k?Rg;~9^s6r}AO zbS7n@Zabok8bYNN0p~%hEf?J~H$79X$+lc9mDU<7p!eSDvJ`hnx>#bd7uV>wGF4Te zs=)tMV5tMh4mJ2w%Be3ISIQP0=0T4}(wUJoJPqXnma`kqP( zj%uY+We}1_f}D>omCUFkRC`uLQ`rpIcrp#qZ)sgicW>?)mr43F!1ou!*ilSRw;19RuS$p8QV literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata12_be_119.dta b/pandas/tests/io/data/stata/stata12_be_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..3c9736d0f3af3c0dd504db2cafb456fa0f73290c GIT binary patch literal 2632 zcmeHJJ#NA<6gD6#m91Snl!dhhqztA?UyXA}28rDU{T9f6@u{B)vnoO1(%g z(vkWnX=s>W$wCHye*b=c&#^&BDybVtN?^`tz&HWUB1Q!xosI)IBfv`}TXL;1nhXHv zBn$~cs0MgQV63P5Q{>EH=(lgcNT9>!|QVc}Ts8xiX2F 
zAnPA=v{I=Q*jnwLs$_s5Hzm~lKQyp6k5KAtjP{?BYs1p8(V7lZBDg|jd z7wxo9cFUpVh}{e_?Y#{FXNy*?7dzDrn7m7{!6P`diwCpxJ=AorvfjllkNE7x?PFEC2ui literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata14_119.dta b/pandas/tests/io/data/stata/stata14_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..e64353213b1c966ec75e8355e03b67395723f9f9 GIT binary patch literal 5574 zcmeHLJ8u**5FQ{rMF9dTB(zOhoC}JQLzWW~2tkQLlt+E?-8s2c_SUDh14^1iq2dov zAt5Lz=x7iiL~{}U08!A;&{KgxNXPiGxjd>Y5z9$u?RxC>jA#6PWJd{l>Zf%eoJ?0o-FaHEj~{e6XjpfGrSV2F}-D3i3AHaKGC?0Wznx{S9q@t&vPq_AnXB<@*9DU^J>_u1quKvMON{m)|k0&87 zN1~)x8$z)YfZcBVZ2f5cYJG1}vZvXw#~g>p5Q}(8xh`|b$3h*7JPlZIGoa)QDb495 z`nN#biFKai2N;he&((xcT+F#bFeUR{1h_wKuSlw7qN7l0yOq3o0Fdq}6FmSs>y&J> z(nXon@h7t0Vn!hI5WL#Alx3;)7PAT#2D3?K)<{YEV{prjRz%63EhV@2W*K`|ri;L# zBEap8Jk%~btTVIs?p>N~f6Bt6*n~(#!0}bGE&`pm*VJJUqq(~QtmN(Wbvl3w-BcCa z4|S~_!&GWK$$0vA9vla^IE=Lo^A0ZAm9PjbnmC~gVfNWzZYjF>WycR9jMKUR{sn+L zZi2jHL(QqdD_SHNa)uSfDB1VLT#m6Zu2J!*T#}j}1F+XwCfgYHH(TF^Z%Px-jY_3u zj4Vfup`v7)`h>9{d5`y?FPe(U`2S?JNt+iGet7vXhy<>Ad&)Uca`}_XRAEsogC<(%F&x@jb0cU^={RqH#C?V_HrXPhtZD~7yPE9RaR?S zj17|=_mUZj>2mB)Hm27c?fdL#|GJ|WSxlcgdTAo2pB%mXI;QWyH&)>hlL0G+yr4H~ zTv5_T`nE~qSL0{ncjE{6ZEPbRG3*{iETR?V8cU^|3Ut8JbYRKNfP!;`?Mt&2KUsJbo#B#u3?C(`+|y}CNgW0;x_c17Qrb~orvs=kELGlmpewDMrBq=u z;qKpga2(y^AksFiJGy39!y>S3@`5ge$!CK(lj!1?96u6bUe*Qh7y#}&`H2G?FuMjT zX)ZCv3`&ZEf9Q)j45L#_qv1)hA~YjC)`twqjBgVxzB`w03zN@H3Z+F9i)6RHZQ(b* z_0@?ige>CZAqG)ccdfY6u}!&v2DHeF)eX@U>23U(*v7Tyk5iW zzZ2G9ZtlL_eU4eYv%Wv~Lx95~o;nh8<39<#v*^nee9U#$5t0!#9f-(dBz`IGg&Ei@ O(}4p~vzaxmgWA71;YXPO literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata14_be_119.dta b/pandas/tests/io/data/stata/stata14_be_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..09d08f7e992ea3ae410bac190f93f5cdc4fc9e5e GIT binary patch literal 5574 zcmeHLy>AmS6n|;c0u=)Yn2^vdUAd;HYJ)7dv`~s#O4W}!auU;=dO1gD7t}7TVB#NO zLP9Vwu(2ROh$R*O0b*cbVP^t?kizpvnl`ZXsCq<-b?0aMJ^MZX&OX0qA{A4tP*#kR z3TI^=Qxfx#Gs$T#mnVL+LW&!Ti&zWXT$(0+t4d}m030L!EG6^0s8b*D=P3zUk%ts6 zELQ^Q;ln(y{wNAcqRhQA>h+8O@vR(DL6xID)fzoLJmy`g#oq8xZp6!tULF}Eev3=| zrm0m{YihK50p8g{Chus)(!Q*tuD$=0tq(k~^o(@$p`~XR9sOwOxi^l!C%)E7?yxvu z#gG@&MvY4f{m{P+jUSEgjh~Hg#BXD3@rdE{C}I(23lrJJjw-B1NpiKq~hux&IoMqbp};09tk8 z?~0P!BCg^`q=Wf{K&H|0;?Qc6tu~lX+A%kptY>14#BY8W{N;x0ykO3jg8kA-=HBho zMc_~oU}i-g>X02anEv;+Z_SoJW#Ex?hR57v_)1abo=V$m>M)4W-2DKS()Ri~9YBe0 zs`Ty#s?v&KN);9-JpDTlj)PkqM9PGD2k+VKVGy{j4D3384N(WeG0DIR0U8I&X?{vR#+a*R$fg^DM|s!$A|R~wOm%=k8v@vpiDa8>Ad zZc<1kA}7+b^<@j&_||94F)f5F;;@KLl&+akOr~otEuaD=mc`nJz!t;JHhxWP17s&o zHHH30>GNlEF#ci1B;p&Zi8(0g^`%?Qv5;o&VRqb1rTzr=^LjMzUpVzEcP3!9RHxh41L;wH) literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata16_119.dta b/pandas/tests/io/data/stata/stata16_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..d03c489d4342d3e908da62d6b4dbf22e34dbce00 GIT binary patch literal 4628 zcmeHLJ8u&~5I&xv2_h&R%{6IgE_Ogl(6AK>MZkdk1Dv(JQBFEPbh|-vXDp{hR1pOQ z1r=Q+OZ)&kl+ItkCE`yovyZbcc2I07vV?ABd3NT$dED&m-OYkjT=7y>aR!HiS4GSq z7Lnjmuv~5j0>?nPr9>kpz^mde1Y;KV*d5RFAlPG2U@JJK0YQO5#LFUL#F!bVuqM!B zt;yE5a{g=H^7EUyZ9l*D>?!&hUxmO0Q{zoEV8&NVAKfsMT}yx6GW_aoL!VPTIDXq; z-rYC6vh@1{!~a<`^xi{5*DZyByuy1PhrAq#l4>=DWRJad4;{9>HUt+g5wyeXNw)DMFCA_!x{=>pP;qB5J#X<)7xf5%OA~JU=tz|Gv#n(EJy*6eaF&7V z$N=sCi-kFUo9S|uQ_q9z2qd$aWgyEymVqn-|7iv$IwAX7fyyC|c}0mBKRUyTWDrbA zs@QU}^AE$*;g4Z&czWLXcXrU|#d+N0jbQSTThDc@=J{tS0@G^cw~G^eSrM_O?Si>jkK#lk8>j4W+<#{$6Jl znN%qufNm-Q&~i3CtESf$1zIgtGFB}C(1bfRd%oax=)hpAG^}a@BGtN6)$2tb>0a#O zT7~2iDO8<;TagY)eUg$SCDn0}#HCwfrNg=-7;6C=Je|48ZiTkx#5W?b 
literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata16_be_118.dta b/pandas/tests/io/data/stata/stata16_be_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..bae769c038820091a3c52ec98f84f12bc0863f4b GIT binary patch literal 4614 zcmeHLy^hmB5Z*Y+r2-K|L+9oin!V&e%AwIVoS-;eK*|H;Hpxb3$@!6Y4=G(hxfW3& z3Pi;VoD_)%NRBf100^YK1`X_=6WfWw)5txv_So3`Piuc0Tg1kez8ib@ znYCSG?|m}1{KMK`#@^optg^yu8hEtjv!)mg87JrwdVq3cG?vpz+)~g!l~W@ZqFXaz zcJ%90;q@y!O0LpU-uAp5=11xgG`R@;T@6Aju*(tX`ZV+Oo2>+Oo6ltL{2|;TY>gD4QN}iK)z^tZB77hcslZsNp9XOt`=Vx z)5X=z+rOuOHJj9rXOl)0HADs7&7q>P-@eSp*OU4C`Q^1e%SGYQ_UuEnXWG<;b8I+g zIsLetj+fJS%juxXtGNbVfuw1^5( zP*70OMY6;P*r9ZO0hfrMU|!2Uyh5>sw5WmmQeICTs@X;ELez|lS6UYClD z-2txdyY>kg+O<9Rz^?5-dn$58cfkoK(j&>!x#MQ zK!@4T)hFERxf@C@(-PM9ybb0@@)0x{5ByC{LMgDx8L0Zy^$4sVaNlD*y@`Z4r>9aO zaQ;HztRHjB5ZOW~yAp>nv}%0=WXY0=WXY0_!ReI|-TB3bfBjK-!!J;-k}RGYrm>rz4J> z`1W#sJ^wzR%&%|X{X6~Z>A3!4I&Q>K1B6O)To5NuWq*8~jc&%X53{SA&vD>#fBJFR zM@%@>HctPm9TDF?qk3T!~I1calEMNY^vriY8pqRxL@)9%UgrCY)HYT5QSA=HAg8 zE>^@+;nd}h+zeRk!(Bz%%D}APvLmbtC;mxL^#bxp$MDO>D*;f?Uf6;xo$W*^X+XFp z1+WGvNwX9!Q8G!I0$3zPD@s~*sWg&Q1+sc6YDrQn=HVs@>Xy|>5z3Na*~S|#M|;D0 fh2AZNWhrdML=Xo&rCj{9BDZDi8&M9UKZ5=M?@bKz literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a27448a342a19..bf17e62985fe9 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -314,8 +314,19 @@ def test_readold_dta4(self, version, datapath): tm.assert_frame_equal(parsed, expected) # File containing strls - def test_read_dta12(self, datapath): - parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta")) + @pytest.mark.parametrize( + "file", + [ + "stata12_117", + "stata12_be_117", + "stata12_118", + "stata12_be_118", + "stata12_119", + "stata12_be_119", + ], + ) + def test_read_dta_strl(self, file, datapath): + parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) expected = DataFrame.from_records( [ [1, "abc", "abcdefghi"], @@ -325,10 +336,20 @@ def test_read_dta12(self, datapath): columns=["x", "y", "z"], ) - tm.assert_frame_equal(parsed_117, expected, check_dtype=False) + tm.assert_frame_equal(parsed, expected, check_dtype=False) - def test_read_dta18(self, datapath): - parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata14_118", + "stata14_be_118", + "stata14_119", + "stata14_be_119", + ], + ) + def test_read_dta118_119(self, file, datapath): + parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) parsed_118["Bytes"] = parsed_118["Bytes"].astype("O") expected = DataFrame.from_records( [ @@ -352,7 +373,7 @@ def test_read_dta18(self, datapath): for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) - with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr: + with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr: vl = rdr.variable_labels() vl_expected = { "Unicode_Cities_Strl": "Here are some strls with Ünicode chars", @@ -1799,8 +1820,18 @@ def test_gzip_writing(self, temp_file): reread = read_stata(gz, index_col="index") tm.assert_frame_equal(df, reread) - def test_unicode_dta_118(self, datapath): - unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata16_118", + "stata16_be_118", 
+ "stata16_119", + "stata16_be_119", + ], + ) + def test_unicode_dta_118_119(self, file, datapath): + unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"] values = [ From 5411cc45ffdc38f528c5d4477f3a159f3e1e5b2b Mon Sep 17 00:00:00 2001 From: Brett Dixon <93047660+BDixon808@users.noreply.github.com> Date: Mon, 10 Jun 2024 09:45:14 -0700 Subject: [PATCH 072/272] DOC: added see also to docstrings in mean and median (#58951) * added see also to docstrings * fixed typing * Updated code check --- ci/code_checks.sh | 4 ++-- pandas/core/generic.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7bf676d5628aa..188f5678bbbba 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -71,8 +71,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.DataFrame.max RT03" \ - -i "pandas.DataFrame.mean RT03,SA01" \ - -i "pandas.DataFrame.median RT03,SA01" \ + -i "pandas.DataFrame.mean RT03" \ + -i "pandas.DataFrame.median RT03" \ -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02" \ -i "pandas.Grouper PR02" \ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 80314c2648f45..84745b25b5eef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12571,7 +12571,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "median": base_doc = _num_doc desc = "Return the median of the values over the requested axis." - see_also = "" + see_also = _stat_func_see_also examples = """ Examples @@ -12612,7 +12612,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "mean": base_doc = _num_doc desc = "Return the mean of the values over the requested axis." - see_also = "" + see_also = _stat_func_see_also examples = """ Examples @@ -12760,6 +12760,7 @@ def make_doc(name: str, ndim: int) -> str: a 0.0 dtype: float64""" kwargs = {"min_count": ""} + elif name == "kurt": base_doc = _num_doc desc = ( From b290bf0f9a8b316b84c3d32a2e0fb2cf12da6e9d Mon Sep 17 00:00:00 2001 From: DaxServer <7479937+DaxServer@users.noreply.github.com> Date: Mon, 10 Jun 2024 19:04:02 +0200 Subject: [PATCH 073/272] DOC: Update typo in missing_data.rst (#58968) --- doc/source/user_guide/missing_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 69dfb406daa43..66e42352754ae 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -353,7 +353,7 @@ this behaviour and include NA values in the calculation, use ``skipna=False``. Dropping missing data ~~~~~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.dropna` dropa rows or columns with missing data. +:meth:`~DataFrame.dropna` drops rows or columns with missing data. .. 
ipython:: python From 629ffeb26e9c385cab2f4f0aacb466be8266ff69 Mon Sep 17 00:00:00 2001 From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com> Date: Mon, 10 Jun 2024 21:05:45 +0100 Subject: [PATCH 074/272] BUG: byteorder option in to_stata is not honoured when writing strL data (#58970) * BUG: byteorder option in to_stata is not honoured when writing strL data * Check whether requested byteorder matches the current system once, and store the result --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 17 ++++++++++++++--- pandas/tests/io/test_stata.py | 8 ++++++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e621ab2a5b9c5..07f5b01709223 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,6 +544,7 @@ I/O - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) +- Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 9c6cd2faeaa2f..d1e57ad568ba5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -3037,6 +3037,8 @@ def __init__( if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) + # Flag whether chosen byteorder matches the system on which we're running + self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder) gso_v_type = "I" # uint32 gso_o_type = "Q" # uint64 @@ -3049,13 +3051,20 @@ def __init__( o_size = 6 else: # version == 119 o_size = 5 - self._o_offet = 2 ** (8 * (8 - o_size)) + if self._native_byteorder: + self._o_offet = 2 ** (8 * (8 - o_size)) + else: + self._o_offet = 2 ** (8 * o_size) self._gso_o_type = gso_o_type self._gso_v_type = gso_v_type def _convert_key(self, key: tuple[int, int]) -> int: v, o = key - return v + self._o_offet * o + if self._native_byteorder: + return v + self._o_offet * o + else: + # v, o will be swapped when applying byteorder + return o + self._o_offet * v def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: """ @@ -3532,7 +3541,9 @@ def _convert_strls(self, data: DataFrame) -> DataFrame: ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) + ssw = StataStrLWriter( + data, convert_cols, version=self._dta_version, byteorder=self._byteorder + ) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index bf17e62985fe9..2534df6a82f89 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1678,7 +1678,8 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert 
unformatted == formatted - def test_writer_117(self, temp_file): + @pytest.mark.parametrize("byteorder", ["little", "big"]) + def test_writer_117(self, byteorder, temp_file): original = DataFrame( data=[ [ @@ -1736,6 +1737,7 @@ def test_writer_117(self, temp_file): original.to_stata( path, convert_dates={"datetime": "tc"}, + byteorder=byteorder, convert_strl=["forced_strl"], version=117, ) @@ -1940,7 +1942,8 @@ def test_stata_119(self, datapath): assert reader._nvar == 32999 @pytest.mark.parametrize("version", [118, 119, None]) - def test_utf8_writer(self, version, temp_file): + @pytest.mark.parametrize("byteorder", ["little", "big"]) + def test_utf8_writer(self, version, byteorder, temp_file): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) data = DataFrame( [ @@ -1968,6 +1971,7 @@ def test_utf8_writer(self, version, temp_file): convert_strl=["strls"], variable_labels=variable_labels, write_index=False, + byteorder=byteorder, version=version, value_labels=value_labels, ) From 31c2de5e39cbe82a5e4260af95c3ad72953e27f9 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 11 Jun 2024 21:25:47 +0530 Subject: [PATCH 075/272] DOC: fix SA01 for pandas.MultiIndex.nlevels (#58976) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 188f5678bbbba..18e42ac3ebd35 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.get_loc_level PR07" \ -i "pandas.MultiIndex.levshape SA01" \ -i "pandas.MultiIndex.names SA01" \ - -i "pandas.MultiIndex.nlevels SA01" \ -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ -i "pandas.MultiIndex.set_levels RT03,SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a8c05ab78c98e..63908ada0c73e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1031,6 +1031,13 @@ def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. + See Also + -------- + MultiIndex.levels : Get the levels of the MultiIndex. + MultiIndex.codes : Get the codes of the MultiIndex. + MultiIndex.from_arrays : Convert arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
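
A quick aside on the strL byteorder fix in PATCH 074 above: the (v, o) key for a
strL value serializes to different bytes under each byte order, so the combined
integer key must place v and o in the right slots before any byte swapping
happens. A minimal standalone sketch of that effect with Python's struct module
(this is only an illustration, not the writer's actual GSO serialization):

    import struct

    # A strL value is addressed by (v, o): a uint32 variable index and an
    # observation index. The same logical key packs to different bytes
    # under each byte order, which is why StataStrLWriter needs to know
    # whether the requested byteorder matches the native one.
    v, o = 3, 7
    print(struct.pack("<IQ", v, o).hex())  # 030000000700000000000000
    print(struct.pack(">IQ", v, o).hex())  # 000000030000000000000007
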
+ Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) From 42f785f8c1a391d18b558aaf84347129790d1431 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 11 Jun 2024 21:28:46 +0530 Subject: [PATCH 076/272] DOC: fix PR07 for pandas.merge (#58979) --- ci/code_checks.sh | 1 - pandas/core/reshape/merge.py | 213 +++++++++++++++++++++++++++++++++-- 2 files changed, 205 insertions(+), 9 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 18e42ac3ebd35..4d74fec24c4ab 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -464,7 +464,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.merge PR07" \ -i "pandas.merge_asof PR07,RT03" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ddf6bd3c70988..a6cb6b5f48de2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -39,11 +39,7 @@ npt, ) from pandas.errors import MergeError -from pandas.util._decorators import ( - Appender, - Substitution, - cache_readonly, -) +from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype @@ -95,7 +91,6 @@ ensure_wrapped_if_datetimelike, extract_array, ) -from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index from pandas.core.sorting import ( get_group_index, @@ -133,8 +128,6 @@ _known = (np.ndarray, ExtensionArray, Index, ABCSeries) -@Substitution("\nleft : DataFrame or named Series") -@Appender(_merge_doc, indents=0) def merge( left: DataFrame | Series, right: DataFrame | Series, @@ -150,6 +143,210 @@ def merge( indicator: str | bool = False, validate: str | None = None, ) -> DataFrame: + """ + Merge DataFrame or named Series objects with a database-style join. + + A named Series object is treated as a DataFrame with a single named column. + + The join is done on columns or indexes. If joining columns on + columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes + on indexes or indexes on a column or columns, the index will be passed on. + When performing a cross merge, no column specifications to merge on are + allowed. + + .. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + + Parameters + ---------- + left : DataFrame or named Series + First pandas object to merge. + right : DataFrame or named Series + Second pandas object to merge. + how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + on : label or list + Column or index level names to join on. 
These must be found in both
+        DataFrames. If `on` is None and not merging on indexes then this defaults
+        to the intersection of the columns in both DataFrames.
+    left_on : label or list, or array-like
+        Column or index level names to join on in the left DataFrame. Can also
+        be an array or list of arrays of the length of the left DataFrame.
+        These arrays are treated as if they are columns.
+    right_on : label or list, or array-like
+        Column or index level names to join on in the right DataFrame. Can also
+        be an array or list of arrays of the length of the right DataFrame.
+        These arrays are treated as if they are columns.
+    left_index : bool, default False
+        Use the index from the left DataFrame as the join key(s). If it is a
+        MultiIndex, the number of keys in the other DataFrame (either the index
+        or a number of columns) must match the number of levels.
+    right_index : bool, default False
+        Use the index from the right DataFrame as the join key. Same caveats as
+        left_index.
+    sort : bool, default False
+        Sort the join keys lexicographically in the result DataFrame. If False,
+        the order of the join keys depends on the join type (how keyword).
+    suffixes : list-like, default is ("_x", "_y")
+        A length-2 sequence where each element is optionally a string
+        indicating the suffix to add to overlapping column names in
+        `left` and `right` respectively. Pass a value of `None` instead
+        of a string to indicate that the column name from `left` or
+        `right` should be left as-is, with no suffix. At least one of the
+        values must not be None.
+    copy : bool, default False
+        If False, avoid copy if possible.
+
+        .. note::
+            The `copy` keyword will change behavior in pandas 3.0.
+            `Copy-on-Write
+            <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+            will be enabled by default, which means that all methods with a
+            `copy` keyword will use a lazy copy mechanism to defer the copy and
+            ignore the `copy` keyword. The `copy` keyword will be removed in a
+            future version of pandas.
+
+            You can already get the future behavior and improvements through
+            enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+        .. deprecated:: 3.0.0
+    indicator : bool or str, default False
+        If True, adds a column to the output DataFrame called "_merge" with
+        information on the source of each row. The column can be given a different
+        name by providing a string argument. The column will have a Categorical
+        type with the value of "left_only" for observations whose merge key only
+        appears in the left DataFrame, "right_only" for observations
+        whose merge key only appears in the right DataFrame, and "both"
+        if the observation's merge key is found in both DataFrames.
+
+    validate : str, optional
+        If specified, checks if merge is of specified type.
+
+        * "one_to_one" or "1:1": check if merge keys are unique in both
+          left and right datasets.
+        * "one_to_many" or "1:m": check if merge keys are unique in left
+          dataset.
+        * "many_to_one" or "m:1": check if merge keys are unique in right
+          dataset.
+        * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+    Returns
+    -------
+    DataFrame
+        A DataFrame of the two merged objects.
+
+    See Also
+    --------
+    merge_ordered : Merge with optional filling/interpolation.
+    merge_asof : Merge on nearest keys.
+    DataFrame.join : Similar method using indices.
+
+    Examples
+    --------
+    >>> df1 = pd.DataFrame(
+    ...     {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}
+    ... )
+    >>> df2 = pd.DataFrame(
+    ...     {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}
+    ... 
) + >>> df1 + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + >>> df2 + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey") + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2 with specified left and right suffixes + appended to any overlapping columns. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=("_left", "_right")) + lkey value_left rkey value_right + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2, but raise an exception if the DataFrames have + any overlapping columns. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=(False, False)) + Traceback (most recent call last): + ... + ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + + >>> df1 = pd.DataFrame({"a": ["foo", "bar"], "b": [1, 2]}) + >>> df2 = pd.DataFrame({"a": ["foo", "baz"], "c": [3, 4]}) + >>> df1 + a b + 0 foo 1 + 1 bar 2 + >>> df2 + a c + 0 foo 3 + 1 baz 4 + + >>> df1.merge(df2, how="inner", on="a") + a b c + 0 foo 1 3 + + >>> df1.merge(df2, how="left", on="a") + a b c + 0 foo 1 3.0 + 1 bar 2 NaN + + >>> df1 = pd.DataFrame({"left": ["foo", "bar"]}) + >>> df2 = pd.DataFrame({"right": [7, 8]}) + >>> df1 + left + 0 foo + 1 bar + >>> df2 + right + 0 7 + 1 8 + + >>> df1.merge(df2, how="cross") + left right + 0 foo 7 + 1 foo 8 + 2 bar 7 + 3 bar 8 + """ left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) From bbe0e531383358b44e94131482e122bda43b33d7 Mon Sep 17 00:00:00 2001 From: auderson <48577571+auderson@users.noreply.github.com> Date: Wed, 12 Jun 2024 00:11:36 +0800 Subject: [PATCH 077/272] ENH add *args support for numba apply (#58767) * add *args for raw numba apply * add whatsnew * fix test_case * fix pre-commit * fix test case * add *args for raw=False as well; merge tests together * add prepare_function_arguments * fix mypy * update get_jit_arguments * add nopython test in `test_apply_args` * fix test * fix pre-commit --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/_numba/executor.py | 12 +++--- pandas/core/apply.py | 36 +++++++++++------ pandas/core/util/numba_.py | 56 ++++++++++++++++++++++++-- pandas/tests/apply/test_frame_apply.py | 54 ++++++++++++++++++++++--- pandas/tests/window/test_numba.py | 4 +- 6 files changed, 136 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 07f5b01709223..fe1dcefe05ff2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -598,6 +598,7 @@ Other - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) +- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. 
(:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 0a26acb7df60a..82fd4e34ac67b 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -14,6 +14,8 @@ from pandas.compat._optional import import_optional_dependency +from pandas.core.util.numba_ import jit_user_function + @functools.cache def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): @@ -21,10 +23,10 @@ def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): import numba else: numba = import_optional_dependency("numba") - nb_compat_func = numba.extending.register_jitable(func) + nb_compat_func = jit_user_function(func) @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def nb_looper(values, axis): + def nb_looper(values, axis, *args): # Operate on the first row/col in order to get # the output shape if axis == 0: @@ -33,7 +35,7 @@ def nb_looper(values, axis): else: first_elem = values[0] dim0 = values.shape[0] - res0 = nb_compat_func(first_elem) + res0 = nb_compat_func(first_elem, *args) # Use np.asarray to get shape for # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape @@ -44,11 +46,11 @@ def nb_looper(values, axis): if axis == 1: buff[0] = res0 for i in numba.prange(1, values.shape[0]): - buff[i] = nb_compat_func(values[i]) + buff[i] = nb_compat_func(values[i], *args) else: buff[:, 0] = res0 for j in numba.prange(1, values.shape[1]): - buff[:, j] = nb_compat_func(values[:, j]) + buff[:, j] = nb_compat_func(values[:, j], *args) return buff return nb_looper diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2039386c4766c..75ad17b59bf88 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -51,6 +51,10 @@ from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.util.numba_ import ( + get_jit_arguments, + prepare_function_arguments, +) if TYPE_CHECKING: from collections.abc import ( @@ -70,7 +74,6 @@ from pandas.core.resample import Resampler from pandas.core.window.rolling import BaseWindow - ResType = dict[int, Any] @@ -997,17 +1000,20 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - engine_kwargs = {} if engine_kwargs is None else engine_kwargs - + args, kwargs = prepare_function_arguments( + self.func, # type: ignore[arg-type] + self.args, + self.kwargs, + ) # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has # incompatible type "Callable[..., Any] | str | list[Callable # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( self.func, # type: ignore[arg-type] - **engine_kwargs, + **get_jit_arguments(engine_kwargs, kwargs), ) - result = nb_looper(self.values, self.axis) + result = nb_looper(self.values, self.axis, *args) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) else: @@ -1148,21 +1154,23 @@ def generate_numba_apply_func( # Currently the parallel argument doesn't get passed through here # (it's disabled) since the 
dicts in numba aren't thread-safe. @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names, df_index): + def numba_func(values, col_names, df_index, *args): results = {} for j in range(values.shape[1]): # Create the series ser = Series( values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) ) - results[j] = jitted_udf(ser) + results[j] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments(func, self.args, self.kwargs) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs, kwargs) ) from pandas.core._numba.extensions import set_numba_data @@ -1177,7 +1185,7 @@ def apply_with_numba(self) -> dict[int, Any]: # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(index) as index, set_numba_data(columns) as columns: - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res @property @@ -1285,7 +1293,7 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names_index, index): + def numba_func(values, col_names_index, index, *args): results = {} # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. @@ -1297,15 +1305,17 @@ def numba_func(values, col_names_index, index): index=col_names_index, name=maybe_cast_str(index[i]), ) - results[i] = jitted_udf(ser) + results[i] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments(func, self.args, self.kwargs) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs, kwargs) ) from pandas.core._numba.extensions import set_numba_data @@ -1316,7 +1326,7 @@ def apply_with_numba(self) -> dict[int, Any]: set_numba_data(self.obj.index) as index, set_numba_data(self.columns) as columns, ): - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index a6079785e7475..d93984d210cb4 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -2,6 +2,7 @@ from __future__ import annotations +import inspect import types from typing import ( TYPE_CHECKING, @@ -54,10 +55,15 @@ def get_jit_arguments( engine_kwargs = {} nopython = engine_kwargs.get("nopython", True) - if kwargs and nopython: + if kwargs: + # Note: in case numba supports keyword-only arguments in + # a future version, we should remove this check. But this + # seems unlikely to happen soon. 
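
The helper that makes this work, prepare_function_arguments (added below), can
fold keyword arguments into positional slots because of how
inspect.Signature.bind behaves under PEP 362: anything that *can* be passed
positionally ends up in BoundArguments.args. A minimal standalone sketch (the
udf here is a made-up example, not pandas code):

    import inspect

    def udf(x, a, b=10):
        return x + a + b

    sentinel = object()  # stands in for the data argument supplied later

    # b=2 is accepted as a keyword but lands in .args, since the parameter
    # is positional-or-keyword; .kwargs stays empty unless a parameter is
    # keyword-only, which is exactly what numba would reject.
    bound = inspect.signature(udf).bind(sentinel, 1, b=2)
    bound.apply_defaults()
    print(bound.args[1:])  # (1, 2)
    print(bound.kwargs)    # {}
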
+ raise NumbaUtilError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" + "numba does not support keyword-only arguments" + "https://github.com/numba/numba/issues/2916, " + "https://github.com/numba/numba/issues/6846" ) nogil = engine_kwargs.get("nogil", False) parallel = engine_kwargs.get("parallel", False) @@ -97,3 +103,47 @@ def jit_user_function(func: Callable) -> Callable: numba_func = numba.extending.register_jitable(func) return numba_func + + +_sentinel = object() + + +def prepare_function_arguments( + func: Callable, args: tuple, kwargs: dict +) -> tuple[tuple, dict]: + """ + Prepare arguments for jitted function. As numba functions do not support kwargs, + we try to move kwargs into args if possible. + + Parameters + ---------- + func : function + user defined function + args : tuple + user input positional arguments + kwargs : dict + user input keyword arguments + + Returns + ------- + tuple[tuple, dict] + args, kwargs + + """ + if not kwargs: + return args, kwargs + + # the udf should have this pattern: def udf(value, *args, **kwargs):... + signature = inspect.signature(func) + arguments = signature.bind(_sentinel, *args, **kwargs) + arguments.apply_defaults() + # Ref: https://peps.python.org/pep-0362/ + # Arguments which could be passed as part of either *args or **kwargs + # will be included only in the BoundArguments.args attribute. + args = arguments.args + kwargs = arguments.kwargs + + assert args[0] is _sentinel + args = args[1:] + + return args, kwargs diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index cbc68265a1cc1..939997f44c1a9 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -63,16 +63,60 @@ def test_apply(float_frame, engine, request): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) -def test_apply_args(float_frame, axis, raw, engine, request): - if engine == "numba": - mark = pytest.mark.xfail(reason="numba engine doesn't support args") - request.node.add_marker(mark) +@pytest.mark.parametrize("nopython", [True, False]) +def test_apply_args(float_frame, axis, raw, engine, nopython): + engine_kwargs = {"nopython": nopython} result = float_frame.apply( - lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine + lambda x, y: x + y, + axis, + args=(1,), + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, ) expected = float_frame + 1 tm.assert_frame_equal(result, expected) + # GH:58712 + result = float_frame.apply( + lambda x, a, b: x + a + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + expected = float_frame + 3 + tm.assert_frame_equal(result, expected) + + if engine == "numba": + # keyword-only arguments are not supported in numba + with pytest.raises( + pd.errors.NumbaUtilError, + match="numba does not support keyword-only arguments", + ): + float_frame.apply( + lambda x, a, *, b: x + a + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + with pytest.raises( + pd.errors.NumbaUtilError, + match="numba does not support keyword-only arguments", + ): + float_frame.apply( + lambda *x, b: x[0] + x[1] + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + def test_apply_categorical_func(): # GH 9573 diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 3695ab8bf6cd3..23b17c651f08d 100644 --- a/pandas/tests/window/test_numba.py 
+++ b/pandas/tests/window/test_numba.py
@@ -319,7 +319,9 @@ def f(x):
 
 @td.skip_if_no("numba")
 def test_invalid_kwargs_nopython():
-    with pytest.raises(NumbaUtilError, match="numba does not support kwargs with"):
+    with pytest.raises(
+        NumbaUtilError, match="numba does not support keyword-only arguments"
+    ):
         Series(range(1)).rolling(1).apply(
             lambda x: x, kwargs={"a": 1}, engine="numba", raw=True
         )
From cce2f66b3ce82979d143f57e8ba1656957538073 Mon Sep 17 00:00:00 2001
From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com>
Date: Wed, 12 Jun 2024 18:50:15 +0200
Subject: [PATCH 078/272] DOC: add examples to clarify the behavior of `is_year_start` for business offsets (#58975)

* add examples to is_year_start
* add missing spaces
* remove unnecessary spaces
* add missing parentheses
* update is_year_start example
---
 pandas/core/arrays/datetimes.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 077bde35a4c94..e0a4587535cfd 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -2119,6 +2119,32 @@ def isocalendar(self) -> DataFrame:
 
         >>> idx.is_year_start
         array([False, False, True])
+
+        This method, when applied to Series with datetime values under
+        the ``.dt`` accessor, will lose information about Business offsets.
+
+        >>> dates = pd.Series(pd.date_range("2020-10-30", periods=4, freq="BYS"))
+        >>> dates
+        0   2021-01-01
+        1   2022-01-03
+        2   2023-01-02
+        3   2024-01-01
+        dtype: datetime64[ns]
+
+        >>> dates.dt.is_year_start
+        0     True
+        1    False
+        2    False
+        3     True
+        dtype: bool
+
+        >>> idx = pd.date_range("2020-10-30", periods=4, freq="BYS")
+        >>> idx
+        DatetimeIndex(['2021-01-01', '2022-01-03', '2023-01-02', '2024-01-01'],
+                      dtype='datetime64[ns]', freq='BYS-JAN')
+
+        >>> idx.is_year_start
+        array([ True, True, True, True])
         """,
     )
     is_year_end = _field_accessor(
From de5d7323cf6fcdd6fcb1643a11c248440787d960 Mon Sep 17 00:00:00 2001
From: Siddhesh Bangar
Date: Wed, 12 Jun 2024 22:07:42 +0100
Subject: [PATCH 079/272] BUG: eval fails for ExtensionArray (#58793)

---
 doc/source/whatsnew/v3.0.0.rst        | 1 +
 pandas/core/computation/ops.py        | 9 ++++-----
 pandas/tests/frame/test_query_eval.py | 7 +++++++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index fe1dcefe05ff2..4a02622ae9eda 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -596,6 +596,7 @@ Styler
 Other
 ^^^^^
 - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
+- Bug in :func:`eval` where division ``/`` on :class:`ExtensionArray` values failed with a ``TypeError``. (:issue:`58748`)
 - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. 
(:issue:`10239`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index b7a1cb173f659..d69765e91f467 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( is_list_like, + is_numeric_dtype, is_scalar, ) @@ -508,10 +509,6 @@ def _disallow_scalar_only_bool_ops(self) -> None: raise NotImplementedError("cannot evaluate scalar only bool ops") -def isnumeric(dtype) -> bool: - return issubclass(np.dtype(dtype).type, np.number) - - class Div(BinOp): """ Div operator to special case casting. @@ -525,7 +522,9 @@ class Div(BinOp): def __init__(self, lhs, rhs) -> None: super().__init__("/", lhs, rhs) - if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): + if not is_numeric_dtype(lhs.return_type) or not is_numeric_dtype( + rhs.return_type + ): raise TypeError( f"unsupported operand type(s) for {self.op}: " f"'{lhs.return_type}' and '{rhs.return_type}'" diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 643d342b052a4..ff1bf5632e920 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -202,6 +202,13 @@ def test_eval_simple(self, engine, parser): expected = df["a"] tm.assert_series_equal(expected, res) + def test_extension_array_eval(self, engine, parser): + # GH#58748 + df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])}) + result = df.eval("a / b", engine=engine, parser=parser) + expected = Series([0.25, 0.40, 0.50]) + tm.assert_series_equal(result, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): From 54cf59b4fabae5db3b8c7b6b6003f9275596d5f2 Mon Sep 17 00:00:00 2001 From: Tanya Bouman Date: Thu, 13 Jun 2024 13:34:24 -0400 Subject: [PATCH 080/272] DOC: Miss. -> Miss and Master. -> Master (#59001) --- doc/data/titanic.csv | 444 +++++++++--------- .../_static/schemas/01_table_spreadsheet.png | Bin 46286 -> 100707 bytes .../intro_tutorials/01_table_oriented.rst | 2 +- 3 files changed, 223 insertions(+), 223 deletions(-) diff --git a/doc/data/titanic.csv b/doc/data/titanic.csv index 5cc466e97cf12..0f7d184728a17 100644 --- a/doc/data/titanic.csv +++ b/doc/data/titanic.csv @@ -1,93 +1,93 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S 2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C -3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +3,1,3,"Heikkinen, Miss Laina",female,26,0,0,STON/O2. 3101282,7.925,,S 4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S 5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S 6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q 7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S -8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S +8,0,3,"Palsson, Master Gosta Leonard",male,2,3,1,349909,21.075,,S 9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S 10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C -11,1,3,"Sandstrom, Miss. 
Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S -12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S +11,1,3,"Sandstrom, Miss Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S +12,1,1,"Bonnell, Miss Elizabeth",female,58,0,0,113783,26.55,C103,S 13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S 14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S -15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S +15,0,3,"Vestrom, Miss Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S 16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S -17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q +17,0,3,"Rice, Master Eugene",male,2,4,1,382652,29.125,,Q 18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S 19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S 20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C 21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S 22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S -23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q +23,1,3,"McGowan, Miss Anna ""Annie""",female,15,0,0,330923,8.0292,,Q 24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S -25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S +25,0,3,"Palsson, Miss Torborg Danira",female,8,3,1,349909,21.075,,S 26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S 27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C 28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S -29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q +29,1,3,"O'Dwyer, Miss Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q 30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S 31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C 32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C -33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q +33,1,3,"Glynn, Miss Mary Agatha",female,,0,0,335677,7.75,,Q 34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S 35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C 36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S 37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C 38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S -39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S -40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C +39,0,3,"Vander Planke, Miss Augusta Maria",female,18,2,0,345764,18,,S +40,1,3,"Nicola-Yarred, Miss Jamila",female,14,1,0,2651,11.2417,,C 41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S 42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S 43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C -44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C -45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q +44,1,2,"Laroche, Miss Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C +45,1,3,"Devaney, Miss Margaret Delia",female,19,0,0,330958,7.8792,,Q 46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S 47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q -48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q +48,1,3,"O'Driscoll, Miss Bridget",female,,0,0,14311,7.75,,Q 49,0,3,"Samaan, Mr. 
Youssef",male,,2,0,2662,21.6792,,C 50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S -51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S +51,0,3,"Panula, Master Juha Niilo",male,7,4,1,3101295,39.6875,,S 52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S 53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C 54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S 55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C 56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S -57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S +57,1,2,"Rugg, Miss Emily",female,21,0,0,C.A. 31026,10.5,,S 58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C -59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S -60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S +59,1,2,"West, Miss Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S +60,0,3,"Goodwin, Master William Frederick",male,11,5,2,CA 2144,46.9,,S 61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C -62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, +62,1,1,"Icard, Miss Amelie",female,38,0,0,113572,80,B28, 63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S -64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S +64,0,3,"Skoog, Master Harald",male,4,3,2,347088,27.9,,S 65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C -66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C +66,1,3,"Moubarek, Master Gerios",male,,1,1,2661,15.2458,,C 67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S 68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S -69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S +69,1,3,"Andersson, Miss Erna Alexandra",female,17,4,2,3101281,7.925,,S 70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S 71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S -72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S +72,0,3,"Goodwin, Miss Lillian Amy",female,16,5,2,CA 2144,46.9,,S 73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S 74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C 75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S 76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S 77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S 78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S -79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S -80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S +79,1,2,"Caldwell, Master Alden Gates",male,0.83,0,2,248738,29,,S +80,1,3,"Dowdell, Miss Elizabeth",female,30,0,0,364516,12.475,,S 81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S 82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S -83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q +83,1,3,"McDermott, Miss Brigdet Delia",female,,0,0,330932,7.7875,,Q 84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S -85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S +85,1,2,"Ilett, Miss Bertha",female,17,0,0,SO/C 14885,10.5,,S 86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S 87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S 88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S -89,1,1,"Fortune, Miss. 
Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S +89,1,1,"Fortune, Miss Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S 90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S 91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S 92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S @@ -99,35 +99,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C 99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S 100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S -101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S +101,0,3,"Petranec, Miss Matilda",female,28,0,0,349245,7.8958,,S 102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S 103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S 104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S 105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S 106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S -107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S +107,1,3,"Salkjelsvik, Miss Anna Kristine",female,21,0,0,343120,7.65,,S 108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S 109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S -110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q +110,1,3,"Moran, Miss Bertha",female,,1,0,371110,24.15,,Q 111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S -112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C +112,0,3,"Zabour, Miss Hileni",female,14.5,1,0,2665,14.4542,,C 113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S -114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S -115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C +114,0,3,"Jussila, Miss Katriina",female,20,1,0,4136,9.825,,S +115,0,3,"Attalah, Miss Malake",female,17,0,0,2627,14.4583,,C 116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S 117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q 118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S 119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C -120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S +120,0,3,"Andersson, Miss Ellis Anna Maria",female,2,4,2,347082,31.275,,S 121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S 122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S 123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C -124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S +124,1,2,"Webber, Miss Susan",female,32.5,0,0,27267,13,E101,S 125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S -126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C +126,1,3,"Nicola-Yarred, Master Elias",male,12,1,0,2651,11.2417,,C 127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q 128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S -129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C +129,1,3,"Peter, Miss Anna",female,,1,1,2668,22.3583,F E69,C 130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S 131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C 132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S @@ -135,18 +135,18 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 134,1,2,"Weisz, Mrs. 
Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S 135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S 136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C -137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S +137,1,1,"Newsom, Miss Helen Monypeny",female,19,0,2,11752,26.2833,D47,S 138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S 139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S 140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C 141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C -142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S +142,1,3,"Nysten, Miss Anna Sofia",female,22,0,0,347081,7.75,,S 143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S 144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q 145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S 146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S 147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S -148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S +148,0,3,"Ford, Miss Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S 149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S 150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S 151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S @@ -155,35 +155,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S 155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S 156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C -157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q +157,1,3,"Gilnagh, Miss Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q 158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S 159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S -160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S +160,0,3,"Sage, Master Thomas Henry",male,,8,2,CA. 2343,69.55,,S 161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S 162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S 163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S 164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S -165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S -166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S +165,0,3,"Panula, Master Eino Viljami",male,1,4,1,3101295,39.6875,,S +166,1,3,"Goldsmith, Master Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S 167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S 168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S 169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S 170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S 171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S -172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q -173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S +172,0,3,"Rice, Master Arthur",male,4,4,1,382652,29.125,,Q +173,1,3,"Johnson, Miss Eleanor Ileen",female,1,1,1,347742,11.1333,,S 174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S 175,0,1,"Smith, Mr. 
James Clinch",male,56,0,0,17764,30.6958,A7,C 176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S -177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S -178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C +177,0,3,"Lefebre, Master Henry Forbes",male,,3,1,4133,25.4667,,S +178,0,1,"Isham, Miss Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C 179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S 180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S -181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +181,0,3,"Sage, Miss Constance Gladys",female,,8,2,CA. 2343,69.55,,S 182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C -183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S -184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S -185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S +183,0,3,"Asplund, Master Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S +184,1,2,"Becker, Master Richard F",male,1,2,1,230136,39,F4,S +185,1,3,"Kink-Heilmann, Miss Luise Gretchen",female,4,0,2,315153,22.025,,S 186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S 187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q 188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S @@ -191,33 +191,33 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S 191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S 192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S -193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S -194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S +193,1,3,"Andersen-Jensen, Miss Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S +194,1,2,"Navratil, Master Michel M",male,3,1,1,230080,26,F2,S 195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C -196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C +196,1,1,"Lurette, Miss Elise",female,58,0,0,PC 17569,146.5208,B80,C 197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q 198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S -199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q -200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S +199,1,3,"Madigan, Miss Margaret ""Maggie""",female,,0,0,370370,7.75,,Q +200,0,2,"Yrois, Miss Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S 201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S 202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S 203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S 204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C 205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S -206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S +206,0,3,"Strom, Miss Telma Matilda",female,2,0,1,347054,10.4625,G6,S 207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S 208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C -209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q +209,1,3,"Carr, Miss Helen ""Ellen""",female,16,0,0,367231,7.75,,Q 210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C 211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S -212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 
13528,21,,S +212,1,2,"Cameron, Miss Clear Annie",female,35,0,0,F.C.C. 13528,21,,S 213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S 214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S 215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q -216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C -217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S +216,1,1,"Newell, Miss Madeleine",female,31,1,0,35273,113.275,D36,C +217,1,3,"Honkanen, Miss Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S 218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S -219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C +219,1,1,"Bazzani, Miss Albina",female,32,0,0,11813,76.2917,D15,C 220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S 221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S 222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S @@ -228,24 +228,24 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S 228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S 229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S -230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S +230,0,3,"Lefebre, Miss Mathilde",female,,3,1,4133,25.4667,,S 231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S 232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S 233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S -234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S +234,1,3,"Asplund, Miss Lillian Gertrud",female,5,4,2,347077,31.3875,,S 235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S -236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +236,0,3,"Harknett, Miss Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S 237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S -238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S +238,1,2,"Collyer, Miss Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S 239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S 240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S -241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C -242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q +241,0,3,"Zabour, Miss Thamine",female,,1,0,2665,14.4542,,C +242,1,3,"Murphy, Miss Katherine ""Kate""",female,,1,0,367230,15.5,,Q 243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S 244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S 245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C 246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q -247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S +247,0,3,"Lindahl, Miss Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S 248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S 249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S 250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S @@ -256,28 +256,28 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S 256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C 257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C -258,1,1,"Cherry, Miss. 
Gladys",female,30,0,0,110152,86.5,B77,S -259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C +258,1,1,"Cherry, Miss Gladys",female,30,0,0,110152,86.5,B77,S +259,1,1,"Ward, Miss Anna",female,35,0,0,PC 17755,512.3292,,C 260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S 261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q -262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S +262,1,3,"Asplund, Master Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S 263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S 264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S -265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q +265,0,3,"Henry, Miss Delia",female,,0,0,382649,7.75,,Q 266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S 267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S 268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S 269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S -270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S +270,1,1,"Bissette, Miss Amelia",female,35,0,0,PC 17760,135.6333,C99,S 271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S 272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S 273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S 274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C -275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q -276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S -277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S +275,1,3,"Healy, Miss Hanora ""Nora""",female,,0,0,370375,7.75,,Q +276,1,1,"Andrews, Miss Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S +277,0,3,"Lindblom, Miss Augusta Charlotta",female,45,0,0,347073,7.75,,S 278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S -279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q +279,0,3,"Rice, Master Eric",male,7,4,1,382652,29.125,,Q 280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S 281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q 282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S @@ -288,66 +288,66 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S 288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S 289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S -290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q -291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S +290,1,3,"Connolly, Miss Kate",female,22,0,0,370373,7.75,,Q +291,1,1,"Barber, Miss Ellen ""Nellie""",female,26,0,0,19877,78.85,,S 292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C 293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C -294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S +294,0,3,"Haas, Miss Aloisia",female,24,0,0,349236,8.85,,S 295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S 296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C 297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C -298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S +298,0,1,"Allison, Miss Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S 299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S 300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C -301,1,3,"Kelly, Miss. 
Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +301,1,3,"Kelly, Miss Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q 302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q 303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S -304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +304,1,2,"Keane, Miss Nora A",female,,0,0,226593,12.35,E101,Q 305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S -306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S -307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C +306,1,1,"Allison, Master Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +307,1,1,"Fleming, Miss Margaret",female,,0,0,17421,110.8833,,C 308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C 309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C -310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C -311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C -312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C +310,1,1,"Francatelli, Miss Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C +311,1,1,"Hays, Miss Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C +312,1,1,"Ryerson, Miss Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C 313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S 314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S 315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S -316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S +316,1,3,"Nilsson, Miss Helmina Josefina",female,26,0,0,347470,7.8542,,S 317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S 318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S -319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S +319,1,1,"Wick, Miss Mary Natalie",female,31,0,2,36928,164.8667,C7,S 320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C 321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S 322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S -323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q +323,1,2,"Slayter, Miss Hilda Mary",female,30,0,0,234818,12.35,,Q 324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S 325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S -326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C +326,1,1,"Young, Miss Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C 327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S 328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S 329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S -330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C -331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q +330,1,1,"Hippach, Miss Jean Gertrude",female,16,0,1,111361,57.9792,B18,C +331,1,3,"McCoy, Miss Agnes",female,,2,0,367226,23.25,,Q 332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S 333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S 334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S 335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S 336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S 337,0,1,"Pears, Mr. 
Thomas Clinton",male,29,1,0,113776,66.6,C2,S -338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C +338,1,1,"Burns, Miss Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C 339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S 340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S -341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S -342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S +341,1,2,"Navratil, Master Edmond Roger",male,2,1,1,230080,26,F2,S +342,1,1,"Fortune, Miss Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S 343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S 344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S 345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S -346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S -347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S +346,1,2,"Brown, Miss Amelia ""Mildred""",female,24,0,0,248733,13,F33,S +347,1,2,"Smith, Miss Marion Elsie",female,40,0,0,31418,13,,S 348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S -349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S +349,1,3,"Coutts, Master William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S 350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S 351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S 352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S @@ -355,10 +355,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S 355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C 356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S -357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S -358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S -359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q -360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +357,1,1,"Bowerman, Miss Elsie Edith",female,22,0,1,113505,55,E33,S +358,0,2,"Funk, Miss Annie Clemmer",female,38,0,0,237671,13,,S +359,1,3,"McGovern, Miss Mary",female,,0,0,330931,7.8792,,Q +360,1,3,"Mockler, Miss Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q 361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S 362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C 363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C @@ -367,58 +367,58 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S 367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C 368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C -369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q +369,1,3,"Jermyn, Miss Annie",female,,0,0,14313,7.75,,Q 370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C 371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C 372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S 373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S 374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C -375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S +375,0,3,"Palsson, Miss Stina Viola",female,3,3,1,349909,21.075,,S 376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C -377,1,3,"Landergren, Miss. 
Aurora Adelia",female,22,0,0,C 7077,7.25,,S +377,1,3,"Landergren, Miss Aurora Adelia",female,22,0,0,C 7077,7.25,,S 378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C 379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C 380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S -381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C -382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C +381,1,1,"Bidois, Miss Rosalie",female,42,0,0,PC 17757,227.525,,C +382,1,3,"Nakid, Miss Maria (""Mary"")",female,1,0,2,2653,15.7417,,C 383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S 384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S 385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S 386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S -387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S -388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S +387,0,3,"Goodwin, Master Sidney Leonard",male,1,5,2,CA 2144,46.9,,S +388,1,2,"Buss, Miss Kate",female,36,0,0,27849,13,,S 389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q -390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C +390,1,2,"Lehmann, Miss Bertha",female,17,0,0,SC 1748,12,,C 391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S 392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S 393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S -394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C +394,1,1,"Newell, Miss Marjorie",female,23,1,0,35273,113.275,D36,C 395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S 396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S -397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S +397,0,3,"Olsson, Miss Elina",female,31,0,0,350407,7.8542,,S 398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S 399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S 400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S 401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S 402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S -403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S +403,0,3,"Jussila, Miss Mari Aina",female,21,1,0,4137,9.825,,S 404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S -405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S +405,0,3,"Oreskovic, Miss Marija",female,20,0,0,315096,8.6625,,S 406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S 407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S -408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S +408,1,2,"Richards, Master William Rowe",male,3,1,1,29106,18.75,,S 409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S -410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S +410,0,3,"Lefebre, Miss Ida",female,,3,1,4133,25.4667,,S 411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S 412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q -413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q +413,1,1,"Minahan, Miss Daisy E",female,33,1,0,19928,90,C78,Q 414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S 415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S 416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S 417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S -418,1,2,"Silven, Miss. 
Lyyli Karoliina",female,18,0,2,250652,13,,S +418,1,2,"Silven, Miss Lyyli Karoliina",female,18,0,2,250652,13,,S 419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S -420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S +420,0,3,"Van Impe, Miss Catharina",female,10,0,2,345773,24.15,,S 421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C 422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q 423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S @@ -426,7 +426,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S 426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S 427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S -428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S +428,1,2,"Phillips, Miss Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S 429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q 430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S 431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S @@ -434,8 +434,8 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S 434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S 435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S -436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S -437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S +436,1,1,"Carter, Miss Lucile Polk",female,14,1,2,113760,120,B96 B98,S +437,0,3,"Ford, Miss Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S 438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S 439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S 440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S @@ -444,10 +444,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S 444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S 445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S -446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S -447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S +446,1,1,"Dodge, Master Washington",male,4,0,2,33638,81.8583,A34,S +447,1,2,"Mellinger, Miss Madeleine Violet",female,13,0,1,250644,19.5,,S 448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S -449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C +449,1,3,"Baclini, Miss Marie Catherine",female,5,2,1,2666,19.2583,,C 450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S 451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S 452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S @@ -457,7 +457,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C 457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S 458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S -459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S +459,1,2,"Toomey, Miss Ellen",female,50,0,0,F.C.C. 13531,10.5,,S 460,0,3,"O'Connor, Mr. 
Maurice",male,,0,0,371060,7.75,,Q 461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S 462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S @@ -468,42 +468,42 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S 468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S 469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q -470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C +470,1,3,"Baclini, Miss Helene Barbara",female,0.75,2,1,2666,19.2583,,C 471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S 472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S 473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S 474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C -475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S +475,0,3,"Strandberg, Miss Ida Sofia",female,22,0,0,7553,9.8375,,S 476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S 477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S 478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S 479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S -480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S -481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S +480,1,3,"Hirvonen, Miss Hildur E",female,2,0,1,3101298,12.2875,,S +481,0,3,"Goodwin, Master Harold Victor",male,9,5,2,CA 2144,46.9,,S 482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S 483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S 484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S 485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C -486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S +486,0,3,"Lefebre, Miss Jeannie",female,,3,1,4133,25.4667,,S 487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S 488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C 489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S -490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S +490,1,3,"Coutts, Master Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S 491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S 492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S 493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S 494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C 495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S 496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C -497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C +497,1,1,"Eustis, Miss Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C 498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S 499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S 500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S 501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S -502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q -503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q -504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S -505,1,1,"Maioni, Miss. 
Roberta",female,16,0,0,110152,86.5,B79,S +502,0,3,"Canavan, Miss Mary",female,21,0,0,364846,7.75,,Q +503,0,3,"O'Sullivan, Miss Bridget Mary",female,,0,0,330909,7.6292,,Q +504,0,3,"Laitinen, Miss Kristina Sofia",female,37,0,0,4135,9.5875,,S +505,1,1,"Maioni, Miss Roberta",female,16,0,0,110152,86.5,B79,S 506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C 507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S 508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S @@ -519,41 +519,41 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q 519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S 520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S -521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S +521,1,1,"Perreault, Miss Anne",female,30,0,0,12749,93.5,B73,S 522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S 523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C 524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C 525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C 526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q -527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S +527,1,2,"Ridsdale, Miss Lucy",female,50,0,0,W./C. 14258,10.5,,S 528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S 529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S 530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S -531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S +531,1,2,"Quick, Miss Phyllis May",female,2,1,1,26360,26,,S 532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C 533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C 534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C -535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S -536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S +535,0,3,"Cacic, Miss Marija",female,30,0,0,315084,8.6625,,S +536,1,2,"Hart, Miss Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S 537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S -538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C +538,1,1,"LeRoy, Miss Bertha",female,30,0,0,PC 17761,106.425,,C 539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S -540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C -541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S -542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S -543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S +540,1,1,"Frolicher, Miss Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C +541,1,1,"Crosby, Miss Harriet R",female,36,0,2,WE/P 5735,71,B22,S +542,0,3,"Andersson, Miss Ingeborg Constanzia",female,9,4,2,347082,31.275,,S +543,0,3,"Andersson, Miss Sigrid Elisabeth",female,11,4,2,347082,31.275,,S 544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S 545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C 546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S 547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S 548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C 549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S -550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 
33112,36.75,,S +550,1,2,"Davies, Master John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S 551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C 552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S 553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q 554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C -555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S +555,1,3,"Ohman, Miss Velin",female,22,0,0,347085,7.775,,S 556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S 557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C 558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C @@ -563,7 +563,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S 563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S 564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S -565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +565,0,3,"Meanwell, Miss (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S 566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S 567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S 568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S @@ -572,19 +572,19 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S 572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S 573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S -574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +574,1,3,"Kelly, Miss Mary",female,,0,0,14312,7.75,,Q 575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S 576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S -577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +577,1,2,"Garside, Miss Ethel",female,34,0,0,243880,13,,S 578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S 579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C 580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S -581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +581,1,2,"Christy, Miss Julie Rachel",female,25,1,1,237789,30,,S 582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C 583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S 584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C 585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C -586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S +586,1,1,"Taussig, Miss Ruth",female,18,0,2,110413,79.65,E68,S 587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S 588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C 589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S @@ -592,10 +592,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S 592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C 593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S -594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +594,0,3,"Bourke, Miss Mary",female,,0,2,364848,7.75,,Q 595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S 596,0,3,"Van Impe, Mr. 
Jean Baptiste",male,36,1,1,345773,24.15,,S -597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S +597,1,2,"Leitch, Miss Jessie Wills",female,,0,0,248727,33,,S 598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S 599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C 600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C @@ -608,16 +608,16 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S 608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S 609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C -610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +610,1,1,"Shutes, Miss Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S 611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S 612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S -613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +613,1,3,"Murphy, Miss Margaret Jane",female,,1,0,367230,15.5,,Q 614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q 615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S -616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +616,1,2,"Herman, Miss Alice",female,24,1,2,220845,65,,S 617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S 618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S -619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +619,1,2,"Becker, Miss Marion Louise",female,4,2,1,230136,39,F4,S 620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S 621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C 622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S @@ -626,34 +626,34 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S 626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S 627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q -628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +628,1,1,"Longley, Miss Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S 629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S 630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q 631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S 632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S 633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C 634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S -635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S -636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +635,0,3,"Skoog, Miss Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss Mary",female,28,0,0,237668,13,,S 637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S 638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S 639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S 640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S 641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S 642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C -643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +643,0,3,"Skoog, Miss Margit Elizabeth",female,2,3,2,347088,27.9,,S 644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S -645,1,3,"Baclini, Miss. 
Eugenie",female,0.75,2,1,2666,19.2583,,C +645,1,3,"Baclini, Miss Eugenie",female,0.75,2,1,2666,19.2583,,C 646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C 647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S 648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C 649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S -650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S +650,1,3,"Stanley, Miss Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S 651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S -652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +652,1,2,"Doling, Miss Elsie",female,18,0,1,231919,23,,S 653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S -654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q -655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +654,1,3,"O'Leary, Miss Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss Hanora ""Nora""",female,18,0,0,365226,6.75,,Q 656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S 657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S 658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q @@ -676,10 +676,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S 676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S 677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S -678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +678,1,3,"Turja, Miss Anna Sofia",female,18,0,0,4138,9.8417,,S 679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S 680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C -681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +681,0,3,"Peters, Miss Katie",female,,0,0,330935,8.1375,,Q 682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C 683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S 684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S @@ -688,48 +688,48 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S 688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S 689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S -690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +690,1,1,"Madill, Miss Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S 691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S -692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C +692,1,3,"Karun, Miss Manca",female,4,0,1,349256,13.4167,,C 693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S 694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C 695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S 696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S 697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S -698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +698,1,3,"Mullens, Miss Katherine ""Katie""",female,,0,0,35852,7.7333,,Q 699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C 700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S 701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C 702,1,1,"Silverthorne, Mr. 
Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S -703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +703,0,3,"Barbara, Miss Saiide",female,18,0,1,2691,14.4542,,C 704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q 705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S 706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S 707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S 708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S -709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S -710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +709,1,1,"Cleaver, Miss Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C 711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C 712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S 713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S 714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S 715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S 716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S -717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C -718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +717,1,1,"Endres, Miss Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S 719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q 720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S -721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +721,1,2,"Harper, Miss Annie Jessie ""Nina""",female,6,0,1,248727,33,,S 722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S 723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S 724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S 725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S 726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S 727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S -728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +728,1,3,"Mannion, Miss Margareth",female,,0,0,36866,7.7375,,Q 729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S -730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S -731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +730,0,3,"Ilmakangas, Miss Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S +731,1,1,"Allen, Miss Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S 732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C 733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S 734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S @@ -741,20 +741,20 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S 741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S 742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S -743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +743,1,1,"Ryerson, Miss Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C 744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S 745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S 746,0,1,"Crosby, Capt. 
Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S 747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S -748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S +748,1,2,"Sinkkonen, Miss Anna",female,30,0,0,250648,13,,S 749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S 750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q -751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S -752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S +751,1,2,"Wells, Miss Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master Meier",male,6,0,1,392096,12.475,E121,S 753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S 754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S 755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S -756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S +756,1,2,"Hamalainen, Master Viljo",male,0.67,1,1,250649,14.5,,S 757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S 758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S 759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S @@ -766,7 +766,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S 766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S 767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C -768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +768,0,3,"Mangan, Miss Mary",female,30.5,0,0,364850,7.75,,Q 769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q 770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S 771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S @@ -776,22 +776,22 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S 776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S 777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q -778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S +778,1,3,"Emanuel, Miss Virginia Ethel",female,5,0,0,364516,12.475,,S 779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q 780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S -781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C +781,1,3,"Ayoub, Miss Banoura",female,13,0,0,2687,7.2292,,C 782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S 783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S 784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S 785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S 786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S -787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S -788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q -789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +787,1,3,"Sjoblom, Miss Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S 790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C 791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q 792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S -793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +793,0,3,"Sage, Miss Stella Anna",female,,8,2,CA. 2343,69.55,,S 794,0,1,"Hoyt, Mr. 
William Fisher",male,,0,0,PC 17600,30.6958,,C 795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S 796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S @@ -801,47 +801,47 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S 801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S 802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S -803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S -804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +803,1,1,"Carter, Master William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master Assad Alexander",male,0.42,0,1,2625,8.5167,,C 805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S 806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S 807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S -808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +808,0,3,"Pettersson, Miss Ellen Natalia",female,18,0,0,347087,7.775,,S 809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S 810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S 811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S 812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S 813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S -814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +814,0,3,"Andersson, Miss Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S 815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S 816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S -817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +817,0,3,"Heininen, Miss Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S 818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C 819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S -820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +820,0,3,"Skoog, Master Karl Thorsten",male,10,3,2,347088,27.9,,S 821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S 822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S 823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S 824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S -825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +825,0,3,"Panula, Master Urho Abraham",male,2,4,1,3101295,39.6875,,S 826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q 827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S -828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +828,1,2,"Mallet, Master Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C 829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q 830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, 831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C -832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +832,1,2,"Richards, Master George Sibley",male,0.83,1,1,29106,18.75,,S 833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C 834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S 835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S -836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +836,1,1,"Compton, Miss Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C 837,0,3,"Pasic, Mr. 
Jakob",male,21,0,0,315097,8.6625,,S 838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S 839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S 840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C 841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S 842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S -843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C +843,1,1,"Serepeca, Miss Augusta",female,30,0,0,113798,31,,C 844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C 845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S 846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S @@ -849,10 +849,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C 849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S 850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C -851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +851,0,3,"Andersson, Master Sigvard Harald Elias",male,4,4,2,347082,31.275,,S 852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S -853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C -854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +853,0,3,"Boulos, Miss Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss Mary Conover",female,16,0,1,PC 17592,39.4,D28,S 855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S 856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S 857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S @@ -862,31 +862,31 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S 862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S 863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S -864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +864,0,3,"Sage, Miss Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S 865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S 866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S -867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +867,1,2,"Duran y More, Miss Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C 868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S 869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S -870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S +870,1,3,"Johnson, Master Harold Theodor",male,4,1,1,347742,11.1333,,S 871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S 872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S 873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S 874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S 875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C -876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +876,1,3,"Najib, Miss Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C 877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S 878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S 879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S 880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C 881,1,2,"Shelley, Mrs. 
William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S 882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S -883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +883,0,3,"Dahlberg, Miss Gerda Ulrika",female,22,0,0,7552,10.5167,,S 884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S 885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S 886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q 887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S -888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S -889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +888,1,1,"Graham, Miss Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S 890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C 891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q diff --git a/doc/source/_static/schemas/01_table_spreadsheet.png b/doc/source/_static/schemas/01_table_spreadsheet.png index b3cf5a0245b9cc955106b95bb718c039fa1dac74..4e3497879de31c282c956362afbf71c087b92a81 100644 GIT binary patch literal 100707 zcmdpdWmuG5+b)WNN~xeUh=?HF9n#X>-Q7I{1ClD;-2wvA-6h>Q^w2f*&==7|TrX+JsIw_nt&FzleH=X?62 z=5wL2p8fgEQ_F_W;k9w|cDKa~CB%7Op8lZf2*CB15&7~B?Ct^awXGC$k6n@z`Emov z*yayhN@lYd$w;2_x=iC4fDv;R!8v=JXs=1UuAe@m2z)8>Ki~FxCcDz#u2O^%@jhFc z53eARnwM!EEzB%%`TSgWaV+>I3i51W)s;bJat$r9ulj|{L)=j~}`1d1y8%)WeIoq8} z|Gq9U-e3Y|CtAp8HfJZ|iAwr20E1Qv7~xo>hP_oQZCvGSa63orVm(4}M4chn2jTioBquCit)VqeE23>k}i(j(Z4AK!8n8y}V z2MT0)bc`_@v_K?p`Wjqqccfk&C>5JM)T3PBzLBt9%|6A^d5Q_EO~A zmb?;P24n-mTk9}b`OrE#vUIyYk}FsIwz=|(5QQwgE^3{c>41LC2jsS07;=QZk^m-q zPtA1{W8c$AsSmJXN$?-yp6Vzva#;N!|M{MSp*s%F_$|y zggkZ}4oWSi!?}mM-}IclsC?!P$TF^kl97{xFrF97SoR$X4B7YAK&;PPA-i9j0vCZ%>=!TUInrv>#>*D^@oDA z_4e#8+aIu=eA&cWHOB7(JV!=(>}LG*%bWON8s;a&+1|vRMcwh ztu16y%eh8-IL+3$OEbM{8Va;|O4MZ>72kCTf5qG(?)n|)LJ!{di69fQ;wZ~Ty-f~q zfZbQ8!Df(BN>0!3TU$*{%^uOV3JMM-fv8Zx7vwAHID(V{l&B6&3SknI61iW2#U3vP zYt$Y|XBAGZ2eMEm`z*=o(ueV$AkI5Rp&^Xvyw1}b&lgYHbFs#VYpkYXfpxvTzP>?; zN{-hXjKL~)C?k98Yf7h41a&!`vdYjtCSjsavT=FINXZsj@Rq+fxvIa~daiTNdHi5u zyiUWT_E5IGX+Hu>xa6xuu2EmPQ5rWSY^ljTnW@WaTLoBUg+(srZ+>f4c(*k@EBg`S z{preFLejbcS2pX|*rfdJSKl=za?A#`>!?H(f#Zu?q`SxFOOV+V8tAM83#uCqL}y!R zC5fMkT71Dh;<3Cuue8kh^z!A!>BbCk+2)a)lkJ>64@!Z)|J^Ssy*e21_++g|^8AZS z{UiO|y$gdEqsOcoHTJeT1zVg-EN%IUk{&SLRHxiv)Tq?N-ud&?IbWLSq2DBFX|91! zo=y~K;Gmr!iF#_%gl-9RtR)&)JamYKUZ3wTzww*2h`GKA+1!GvOim2@dK{31s~120 zy9sPcIm$QG7_rCV%g3QAOQrDrcN zvup}1gE4nxkQeEu+fxe1!_l1dz&UOOY}BTZg20P*w$U~bb9MFHc(b)PCcj$?XkLVF z3t3_RoFfV>Ch!goduDNhq_!bx;W1jYtlf8fhhlL_#Ch!KVbb#rlE&+t6s9}LUQ$$4 zX>n=KiSBCQOiYTe9U(kzIYuBezKh`YK|b>>(;FPiU?805@iXoXhQ9#nw48*YpAZ8j z&(O?_HW7ox#H=Bs}i^Iu-^Cv)zsKQ+VDsv=W$v zrr1OW%N~9jFi>Q3Gg5kxgx5RpJat{Gby@6r+rsP%IArs~7mGMWB6 zgiEcDhZFqtpz@?AXJ@}I((^b|LPdr5^!fxlz!Gf_f7Jop9muDQoR`SK`Q3MZe}X2 z^|#FfnJ)CpGL?Jnl3)tdoB1e7wM1}Bs$oHEWYW&<9_QMP$~5V>|WJ`#Ksb?ssGXXWHf;IWZ~?S zc1_&xq+-iA0tt_T$bK6%%`|J;+cUy!dH;y&-Q*xV3hL_vIf^x+Gc!l`dAL?rH$@}X zWU>X(+d7x(399;B55kI6@AKrJ?rt##zU(xr)fJ^>RCePwNL6Rhx7TyrfFXD$w_8d`hPI-e zfelf@Y_X)q#|Tn!cml)m$;Eq0x>V8cQol73)RGbDzbZMe!+2L1t$nTf*Fcv3rWhMj(2!yQ6#iQ?)GD0}6PXOZHTHAFp;g#GnOl+SRmBnl=^ zcAA6YGrG3c4~7FPD0gAh!BH!pG0`Zul~uIbcrt`$?6c}cZkeD38Vb!VEhtxOspOv+ zOIu~tZgI%W;UiCOy3b4jT)g$J`!?^cRuO5bQd#f#TM=NtU=NPX9bH86Ncp z-`*+ERA!SgoSdGS?qos!ob5=c!qq*&Udq6!B*>6{&~$M;`9m$icel(LQXy76&1&5D zIT&+a5}3WYrMa(vkc#2p@huKjrT69$D_`Zyoc?hux9YGk!_~=1mWPH!(!*Egdqx)o z0EkKB^|`3@3KZ*hPL~txMQ-b(8ICXZ4){B|EjUP??aX&?nc2XfZv#NsSZl^Etg0A! 
z&h|u2?PR(2ahse$j-^^9nt_5kY z_#haskAv~teD4n(0jDKOrq8)v&ADw2g?nfh{C={bvHo~q?OH4_L=Ds;+C;-!9csMO zeoqhb($Zoo-$Nq3$97%$FIL)i@F%M#z}u6I*r|tOj6WJ8`3+oh0I|P%GvFOT%t z#qb2xry8&kU;=z>jFr_tA^H{OAuObKi|HfRXzOP!ML(g)hg-6F@tVRjkY6VGBM7A0 zfw;S@xF}Nxz~9aQHNw!>wZ@iUNP%lI4~#;gX}xnBF7%#GyJjp;YOEV*4;O$CD@8!~ zXn*a8Sm(c017t2LW@Sd~>oTPo?XR&`Ar7b9{C3a4z^j*mM}{j;=`z6%{% z3sn%J;JaP3B?MpYcTEnap8J-W^o<#v{rFx;W~hQU(uDOcX9b7<2X^!kk)By{k3$kb zYg1m=bhZHnbc)tj^nY7Z{<>FtCias-@20OQE%P+ALmc znyn9BPf=hGmHtP*S{*f$1+4JBLfyW;jS+P6stW%61Hp$qI0}8nY;QT72RgEnvEWeTbbC>FPsmOlu^)MU(`>)Reg+aa z@e&aYV02qdFmS~?IfgQG! zfqimS8P9GhMBiDZ>>Y#K`OeH|cS&uvE!5d{l`etj8o-g)1R%2$Hh^-a{H0vJnchlP zOQ&Wed>4#FQ)M`H!>!nUxxEU<>t)5%EXnyQ0vnc9h3R+0ka1*`_jW9y>K{ z?amp}CPz0nqov?s5?R^zZCNOQ0MLU;KCo|N@$xkZz+QWGp8e7h`sl_>svzi1u2iC< zq+PTPoo=f#yOno#Mm#&aEU-)cTUVpgC+pev0KP-XTE&|DuozQms7esJC1|a|Jb}UhA+VVQTuJO9E4}plXqD^cQ#R>O^|h z1Sx?G9g^0|DuhVpFP{3j6*VvoEMkOE+fJ8oyAQ2b|m4U7O>7PWw z+@D-HVO8pKu04#6-+bTImBbIBM>@WfYpZ1Od< zp@UtqpKxUbe$*b#58F6AkQa=8$Yq+2jgRkY#7fn=ny`s4I|)DM(J$>E_sI@+8w3hM zx#^s*)?F{}#3UQdCq@b?s(6JqWCA(l8q2Q(^=6{c3?tObvPS!3B>5myTgfUAKjtor zoG}q37QLkcTbuT+khh2S_P`z^GVv2)TsOES0-`5klxuAl`csdR{X6ye#3!qCpwbJ+ z=ND-2c&^j*ZZndT<&2bGp->15pF?D))O)qV6PoNfo=#wY{CG0uzx_QNCgFU_BcH}E zx1Vr;yu4H3aYw2Kxv?*LA;M7r4Pw9-_7FFP*v%5rw-+4r)#(C0*`t-Beq1LHm#RWW zk$`|MS4^JC=+wy!Sn|~(?GIx#G5$!^)sO^W@k`PP)v+-1Q1ApLBZT4-l)fy=M_yzBFT%U z_jdNY_EHPNUnkn%#yzRarUi~OAc}TpTKInCjJ1lQGn(qYV>sX!7w_IV=kTavnn7DB zrR&j5lCUaVM#6W^9Gm~*-k6^smy>d_;9Cuin!I)Z0yAT1Xk`w zazH8(%N>tONf=3Ko6*s|tmDXEH6ooCUp+dqfV3E7>T~lX``Z}!Jg`ZqmDCl&BnD^{ed9`Hroz3eg)#NzGwqilSZArSyJ}Ex0}_KBC`R zUE33RXk|h1MszXZCB~`hm`ir*D=UwpGM5`TAmF8?rNBHHN2vyymL4tfIH_*HP8Hkl zYmLm`0%WXfVUT?hnak9g|5McfUfbyU<0B0-ir;A4{qh|Ge14YqM zvK5bCb}c*4HNnoK_kOP(FM~Y)B)4pOIKG_Z|$LlU&TC=3K2?d+6blC(D=Nq^ooth_NXE z8vu7|rIJq1@E6xkVZZdR=j){4Fui28aQDxEKJb=epYTy7-cv{=j zi~DNMKWVvzLB2Z6P^{ps@<| zeJGgK*ARI&u4-m#%Hn(S`t2~dnV#UB8-dTng6@gD^VVX&JP8#peR0H*-hc>D5ZdVH#-|y_1Jc zwz{|YiHgeF$~Mo>&;q=C@bU56T}jIdkwN147IW1;1`JgfgtN5lAHLNY_;jouEEogz zn*U*sn3JJc{R%pMWI8iLHx9QMs`_cGuCJk?QG9MqG-Jtw!=!+NfYlC#X(?)Iek%LD zPXchv=QtQOHn6)pZzng7nwF_*1V^dOoxp8Gi*&E--0|f6ypMQ>nVT6i!BEShwK36% z4tV(ux#!Y**2ivxoT&pU(*WCTfG5mx^Xw|B#ZOwq@<2}Uy1Vn80WzLs!BV2nIB3^f zC30I{PzunSPfbzD{mtD>6_(4mV!hEn-xX+OF~j-(Tg^yciWc z|B^na=GKKyJ`IS=!*fiWgnI^{$VB0p6arO<5PYbw< zs1JK zhxZ1bOS6GU1tLr#xU7$?Hl{N0aab4%-5fyWehcnbHMGm28ZS>vx4G9HK>fAZg>j&E zPr|9Dmh1vWB?wmwiJ#dA)U7AzPbaeV@Oaa)Tl5WXr9ZodTx_-Bkn@;BIpLzhU#*nO z=lAMPdGi~2J-NR(U>N+at4vd9YYz&OeT=-@SI>h!(Hn9rA6(}1-E@HDkTrcnn* z*;JwK=55PLp`=}F{(=p3*$0!Hb!_WY6Qfo5d`{^AtCMPf&cfpw&i_Qs&7BfFhN-I@ zJN9OKrRA1O`mU?*fHk7JnqBbr%EUiFG)WLaq&r~m`4_f9hB>@%#(UHF5{*Y}pd;x| z&_kG=&vq(NQl-B*@h{(ec!nDw7|CZopHGD=`k`%DwrFZGalb3?;TH`E@T;^);${v*h)8@iygBFR{}9O5|0w~# z+IV5)NiY_fUq}<*oiXTjlUb7-Je-!8Ha*jz{T}r@C(ie3Fd^MwA`Vl;GmM&RHoCIn zZ_KxmRr9v6{G(SY_?a9iTPq4eYr@@_#+)cX3$Z)&7 zHRklgRZl{WU^{t;Z`)mBBWL)ZqT$Z=5H0#sH zIyFbEZ(@q=zN;5AV6=bQLP`I7a7)FQ+RRryprAy*S~bIcrfD!U-JL9qdV^AK0=1_b zj4!rZNJwNOkQ)tgnXt0cg(;Ke>r3YAn_nbS7?tfpoAj5O;<4aWErO9GZ0CY6USPiC zXP*U{2&D6KN*WQW6`I(mQQb)pNc%V8PW{A{FfJCfrOptwv{sE3HoD4fS&@;hJg`x zI^PV-Ex7yA*~Mj8cutT-V9za+3RsN7Pd~$kWVe_=(LmUkb=2WAg+9kG`l!aCo&F8( z+*f8iFg$1m>kzXxwgLL5|CUS{zBC@SQ#d1>4;b5e#`Vg?a;XZSPbPZqF-rF}bd_}+ILyDm>DHGM9T zJFZ3qnZFwTA9z($%>zlj4u0qWAGR+nEUY=xym)9_e+!6W%mkRoR(GFsv!e3a;gkfR zH}bR{+KaBhG^&j#Khu|w*rnKK&=H$e4KBL=y;M#JNL&^kI&CQY2^t!+>zJQ5ynn`2CY>v9W9NRVTE5}9T=C}O(99{BaPnhn2%5vU@VbT; zRoRB?YumY2T)$nF(hsZ?AmZvN$cq=De`QycZ_jYn`z6WRhnI%;ZtQaBk2kt5QKLYD z;bp$ayq=pv8Pb`vdw}N-iB32q`!Eod)Xvx)%vX&fn``+Tk^_oUi-*rub>@9{${Q{X 
zp(Eq`H|=eXE>-%GYAQeledg@wnAL43DG|u#ecZh&cri8IGwX>0G&+BRp(E}GiT?#j zLtH7?LV@~fc-V_R;Y$PFt3G($ku1)zLpr`Mr$5A?&oWVgv}~&!4{#F({HW z;0)Lcvx;?eb~d)?)CY_Y>Wm?oOaPns8z^c)CcO$dSue@nO2}*ZmE~8OTWqiFZ-{O0 zVE>gr8lzpUBCm1CyJ4fF*0ePAte4!yg>rFi8oIijeyuD3or;Q&m$NF&$olMSE}Q;mI+i+z z<^`OqyJGLMCVr-_#ts#bN{r8rvpqfQ*_`{{wX?IUaufkg=dmJj3<*rqKYDc|TRRo8 zTiHpQ@@c#Qk&#>n3lgu$h&pFqbse9ciRD^4y0}qsGy9EHuz^M> zak_j+Qz~DFuyD1*LNK5`v6}3CXl~}0q6I@TfCv4%(OpZVz@ZuwL00gyK5(V|^Iv_f z?WiFZSoX!GMW89+eY<`ty>K|&O3EV8^jA*0&D2l@9rab1(24c* z@;X6QDJ*al`lqJS1)!gqY#+98@4AMO^9`&mqN3x*UY+^EQaZKWCwQt8IEb*emqf15 zs;NoSwOMk5gjTO5Pk2cqJ@(wsG9U!a{w(#N9NwlJ%Qx&lGN{q;vL=b0+{eo|e|PE- zhkY>HVl$Ga&}*_*8UVGZ!LC8{O@NQYH?U;E<8)r<$O)!bx3<1M?8cU_r3;*~r`=pI zji&78FD0jSH8wHln?UDcc6IWBq z0eADk*Sf=n|FpfWem$S~@2hCD?m_YWk@xZU>GP6pUnL3ikE4QZfxQnt|(n4|2An5WzF&f4XVXvm90!O`gM7v zYWn8qnKmbv2(y{9Lf7wyUlMmBre`$owi~742ePwWHV$pQ0v?MZfP3hnbsIZWhn8V_ zK{G$}^c3L<>Od@lw@ZMv!#5x&FU??$E-sL*u19to_2v`DS5St$nr?gpSSrld>c`_V z%3L;Po%D5YJ|4AQ7j-xODr9*3=1rw#zj6j>Ls(lZ(4~t*%z z*O^+^@`4b>EkNV^uOsAsK_5t(=DR%%j?xD}`%c8FsL3o9WOAdTqVhhOqyA7}!R)xj z@j*f?>gV1G!}hezu6ZDK>qipQ_+cZ5cxX6=m&uO>Z=6x7B;JIh;pIiW zX}gyWO(;1=AgbyS&dF~@o~?d!OEsQrc|ZXkA>ZP%G%zG^W$REvU!QCjpB2c2NTG-2 z_PID0b;K|5)$qo2-SX)=VkvPfFW8Ck{nvV{4wBcC73E6C6R7 z%;+vOHq*~Son0a)Cl@)YW!nJ64v8e8bRPtiP|~^-r~;jO1&cj~i6nM`MIEDriD=t9C1DlVDI+J(LK$>zhe`uFC>?0!TEt1S^q1c>&BQ zOW@0@)62G^k~(`6`db`Qa{zDyEsWT#_RoO#gNB~={kb>MEzZYEh)SBM9UqraCcXhU z7V!45SkD^rN_xEeK=VvP17D@ArI{b_7Jf83f*{IG{MKS#XLy>kOcuxkEiOpl6LckI zZRx#hcFsRN2*i)+opVskXkN;^BG$RtZp);Fisx0K1tFU=0ig0xveIJ$6%gpCaJy~M z6C1Dh_YdFpl8yo)N#gO2<@;ta#9|=OX?oe^!?8L1W>BMab!GKKFCxS7{M_B($Xr(- zTt2<%VInausn4N}vB2*3!u4R8R9tC{#%$D%E|p)n*iT+buC2%_eW*(&TmseR;VL$5 zkpwT#Hs@)0Fr!}MM2lpkb(Jm&e^TG2(7e!e<3Dy#%f@PL28;|yT*gwPZp;DI{?e%b zwn4?!dR_?$iNCd1dhNl2%TjpNCwkf^%;>P8Sn9aDV%ir)>Y)<3S>s9OoYT9mvT6Jc z6Ji(%A?fP$0Z8?a|8{$xC>S&EXaRMTUJHpCy=sKjSOOqpdanZpOf;YL)A6a+FK!Za zRl=6+L`c-fjxH``B;J!L_tVzZmeoYn^QVMzHSv#KMFe3Iq1S4%c~n$s3Nnbt3Z%9nFnmLWR0@!T=04T;FS4u!{77`Ky z0M4VnzM=PQY_Zj682gJYiFSOi_cnPPIHFMiN9MmL#p=t}`s?fZ_ZzkU$Mr9d9sju~ z!v600U*Fxo{(ju=1Ik|!=6~)6EXx1;;QudgjvM-ITT}6m+ZZ@lT&M(A{`oY4mY-ky z;r{Eh0Yv_h6v}_B_BXT-kGTJN6bhoKC;va*tf&8onjaI=11cxGOHnV05Jy zJ$S4DhSDR5uV7`8P(e6;3D)=lNrDqee+7>TfT6s5IcRL0YwTU z4+Jhr3B%WH$;6dQCCMC4C^f27YdBuNlZ80h98go((LnDnCCLu*P(@!8-+%eQ25Z0I zaQa{{`E_PK_>x#gZ7sSh+U0<33{!8n(w=2=8J^kl} zRr%JdGxzzz;c5tD&;$cN*QPL}(eU?0LXQP&SxMTcVSxlq;;#S+UWM<2DptH;sP`yN zTW=-&_|rsXXh%P3nV+L}-H}+NASxjbh&lan8gbLl&ezw-6dL%pk$7%b_b~B8c|tXO zBDEv;BXg^-3_oL8=1M9jM*ul_S|?z54c4!&Zrv4aZKz{A<*20Ck<#Q1Xlj+0IL{n_ zw+)$Pg>F)XTM0QoOfyyF5iEv0gcu}-_4W=3_^bq&?`323laT`MZ5>tyh5yIU50OBxk!&9KWcjml@tjr(kSS?e)`z zS_g~qweAnp)Tre?@EJ4)*rwa{w8Xf>X97kd1zyYu3fXzb^VPl_Lb8; zv{?bwb_5(n47{q&bx3h<{R6%%^WM|#2oY7+YEqsUPVJxRGeI9?zQ*>BxA`!aYENE#H9eFmjR@i~E-?r*dPRj6Dp?d6Gt zoess04(*){-u*G7<=}{yB==35E{1CAePm>eKqm+Fk$5-Z5j7M0uki-<3WEF~Gqpc= zj4TByP1DVLBA0_)>kBFA)5CWP2QUbrXKmduL`k^eU?@N z&UgocvKxxaZ|~=0dXrf6IjY&>f6vy~GN}qfBgQFpQX$!K?t9#JPjU~n`hJ05Xx+_n zZ@k&NJ0VYQz>Qa4T#Q0^O<`*=<3kj9t&Fh`9}>bstt=g zXI3XJCmW4&5L~MgSL1kX37rSINX*Z|{tMz8?qGgyh1(tC%i>rel+_C37|srzXpD41xdqxTk%9+XY#-X!3JKWmvLLM4rjkyQKv{Q+6s#Y zx1m!UO&10AE4X;Fj@<}H%@0cHAU5ZDlEcM|PZ$|@ehj;k*SmAq)i)~)rIAH#3t!DO zGC9ZB2q#{Q7BLa0AG@&l^iO#PO%%D=Y=F_d(mk z4_pM8`~n_~qYU~(N#nVXuE2LN{_`_kX&E{py`zf-d%NS){$!C^d4_it;GQk+^g!SOd4??YiuBKo6IM$Pb0}ze?J;?Ghh38 zq3Jb*&F!_KbTVvS`5>3xm1&@I8ENCo`}lUbhpuc%ii_*1$ui@zuMO2U+~&)PP^nTjBJSw{JRE^s4e z;GaP+@(ir!GtEv0xuU?^G-e_W=x7IA4w@|>EZ-8n-dF_zJ%%gyZBh@%u5&JS&uhjq zMDR65J_wo5XtKHZi1R^NHKsD29B^z*SAcn9QUb>NL(1I+2v|>e%@uZ~RyJ!262E-1 z;)ATkmZ?{G!JJ06cbrbceS-D5Kk4s 
z%tQ@{tVb97;hYrqd$D~)lv-+-QDnMdop?URJsr&EIpx`dV;$lUkJ^!qkeWWdn2KuH zmR$CE0N*bI$Bgl}`um5g`wm7^uGIBHNL>EWR$ZCz%H`~n7-iOehUqoZ*W!tg-1?iw zT}-Q8^*e%^hSsL!5f`<@Bj>LrLXZ(C9aqJ+Cy`e@?qVZx0QFLz-K5F0smJ0L-|nt$ zAey*?t+-YjM7qZg$$@S0`{Dft zPEd})S+zacAyL@rVT}>z-N4TAB|7>jQzGT9kPwEKCA(ptA#9-R3e=5OOD;vxBo<&{ zNXL6lIy^L2MKIq9Ix8)Ot?%o!d6D{FI3IYTM(S1t>j()*=p#Llpf%}G92|!oMDoEk zmoK&+I^5~bI%W7;PBLU!;23#>IY-ze#ch|RkXLKJa#nkg#^P5higSadxbsU#p|js} zE!H7dG|vF0c6#yA@ej%m7YmRFgH77ZA;#W1n11E>PPdz6-#offbRZ-* z;X!Xa-D9x>csWT8xX)#B+E{$>LBRRXhPX(EIPL-gon(iNyTO8#`qJKTI@6=dwV$LB zbRQtGS=_LHzr#WX@x`C(32kYXo`|fl>t5K`Z2QWCbKb3kS|bz${W1_CN79FAA7x8C z(-o+{dii?8%>FZxfsk(Mmvd`M30B@4fs3PI6S#rXMg*aT4?$6i_`U%jjp`fMbO{#L z&Ka3tfrIhy@;;!|>m_Sv`6J`erAC;?s&RG*81Qr$mAg1rI_84@PYbu9e*@~4bZ7y<8futaj;h+6D(jou z<(~vS$+a@?RaqkVvT*h>k{`i=eoE`n(Z}5q8D4N%Vb!e=21dTgNM7u5E=$vfH|?F~ zUz|-#Dpc9ZNCaKP$(?4)c_-Ef&5DLz9}D8#Umj6kttkrP%cu`E{A56SoAMFkW^8m( zseJ$K*j}hroD3eW`b~GFRctx7$NsBZ>jQNULS{{UrW-ccR5qF)AcL?*FO4pFHg3OB zegBSleWKpFGN`faPtP4Rn9W^8VR^19zSFL)>b^5x_l;$ta@bE>&E9+EH{E&Y8ex@Z zX_hhf5e}A*U=`DYj1qNtUMaK#R3D)}Jv8FLsAsaX z*Bmq4O)WJW8$|TRk!9Ajx&7jbl3;A2(S!VbCq;fxLg2d8AG?P3a0c2844K(BI0_Z5 z<-$dNL0LHUGs(7I5rM@~o(aK?`=9shhH0Hi!*eE0*B8E{r+Zt~(lCor(uno#t`p?_ zt!#$D#Y@r056LwKWP#v=8na(Th5kio%PSLTuq?tSE7FlZ{jn}>TZY*YV;yxTV@s0+ zi;O6C!Y^*`Z{iXY?>bjWk52zga3USw6hLth z;K(SoJzEO9-aoj!-p}je-p#&X%ZVuUK#~hBXImT*LmhANEfnWRw+=41$BI6Yjm|%l z2;2MPdwD?N05|xv9d4GUu^F6g9zP5z+Sy@u7RyWBPnlgHYiT;qE=CY>;!{_eI8d-j zfK~g3+oNZ>W3Y(4ku810Unm$42`)X36jv!#`WLD~#RO;$mv$HO2q7|RGE$z~PPK39 zCvJPL7oE*~c>}$&`(?~w^Guo-uy~T;g;8pmHj_nkmw|kUhLiyUt;|8Td*yZaA9S+K z85TXOU3eG;Q~IRT?QE?VsZ%f=0lw>rjCf0$f~vrFU2B(0vAXk26-KN>`p7S`XwQVL zEIYz)Xm$wRjJ+&N7&`yinFa~1^6?&bGT&iZ|2M{0o;Da?1V~29$nVGvj`)XgV_VD>JHF{zOJ!@^|dB2v}_vP+f}g!x+FpOs^}soA%xu3Yo6VN-?1= zUwo*O-w{>K!sy<9t$A|sdU?q;0wWc+8ZgU3PXIbeQuDrHaaPoE)|FthT4cWQhZQCpN=>7d;V+bBzI5kO-f|s=If@jSNanSD_>9SPRc>U zHFu3U5ouo@xaule4e@Pe(k=F6F*-O*UceuCKNl9Q%!5fN{jJ+RxgtG1_L0s!RS$?_ zX)2TRRC>CLELcfsgO9NgZy*?0bR*dOsORRn0k`e}znwA^QECF`sygDp9+SApl8j2Y z`-B!jO_m)u@TfjmFmo=|*X8a;9||{Y(UqjLf^2hjH&M4NSj{z;#RW87DDm5IZ>IZC zHIx21U=p52o2)ay(`Gn3!tE>Q9jW57C+3+f`$J;EMq&Y7whgm<4xby`0;Be}_KNrb zIWsS)yLtappWQZ1M4=NLSyeI{`lya^V~Y-i`(`7UVx2}Op>34dz@GBp*kdRa=C1%t zIgf0pRWtkaOk+F`yH!}l7cDJiB&ANsNa0bvTU#nu!eYlxgmAqz2e+5_n9Y1Fy^rNP z{U2WuHi8?H#ghtLkE|*1N77c$tOj>wL4sQ?lhmiflJI&%)v=m}h{S~D-YdpzPhK0R z3rq^oj_q3}aMIV2JUr<9Z3)lBOBQ3i^mM!_q#g22qRysdZfI2|t$aEm+xx{@)b6hr zzpi@5>H@H*L@xHFe@=XH?ULf`&FIX+GTE>0xU$Af3ya)#%`?~SyiW&zQh$@ids&c( zJ{%S8zi7<6ctJFHm?;bUW$3-Qc>Kb8uw4t`xp|mTPeV%;0lg61zuiZ98J;aiI^h*!Eu(i_MmdDx%efL+=GIK5v?+pIEG|J)A5f7m^m+1Ul^V5BRRd zSdb%67YWyYdI=H3y@G&wUi!xt>Ks#*XKyIb{eod#lsV++Kch1RPSNn*(AFkfbWw%bQ za@xz@2>;MQqDl9}^PSV57Do~8XU0w&V0yqSRxEmZ)eFn~_4H>tZ^w6Bj)K^2=Zc$W z!)R@SgrA1k#_pD(UgxDJXV#>Fks;Lg3?Sy!Wev}}TE==>TCXO-?a-3p3wmpyd+=`3 zgtZ9x&&{Z@C>i~XV4U{C>oh+1xLoy#6PE_=8@IlsMxZaTWi)lgc9BmgrUsS=Ed5R@ zg!_fU406Ukwm-TStZl8wCG?vwtV>hud4|2sOy}TQv)xl!1VbG#5*p`+^4r=24=GoDyxTR;1(h2GNL zqY}<{pLNh>1k{^rr|ykXxE*z@5L`Z39JO}k{+btz@U`HICwrDnDF>3vt-K+Sd+%GR17@1_v z-JRXdM|jvQ4jXXL+~vu*+dp}b8*AoRP5h>POY)PHPdcY8CL{4(^zhL}%XcP6(ZL^q zQBs{3JhE4#db&R6FEa=p*izToFqXfR4ooyS;LvK5nA#=}F8XXk`T5Wv(ne=6(It{2 z&JMK|14HW0mk<}-gAF7|AZTz)(BLkO zhu{!M2*KS6?rwqLu0fmNu0a}W9D+0g+!=Tx0t zdq4H;UAy+kEl}>ANjT=Dq%^jrZ~c$;bN!FO`bOgJ!c;Ld;$e=E$7+QUb02r7LeBMA zJ;#lKZZN|0q6K!m+Lg`hexch`EXpmNHPJK;1IpHiP8Ym+hiz=OTD|NYua1F(Q38iW zKYF|@#L8LtG_)zl@P(vq4D%|zM61)XOeuaOljyUsf}ueA_HK4)Txij#UQSdmX3@Mw z`upiIUGXmPrsQY^m0sz|W!3Hiv6 z?fZ+4?o$h2LiHN1*&jHal_kf`H@Hd7cINUkmC*(wThppU2_)EJ}q(oxPFL9aIY 
zCLirPS{TP%^p;=0)%c2!)1})CeCOhi;0%36JZ#eKJ(;=x2JwmYpm-oLQO+kwj#hr= zpf@6NmG>xiu%%8}d|2-lpECw_aoi5{P4oHhaZv@9J`ehhJ15ysY;mKIZ+tiH7hM4f zPcNDi_iEpFDm#j_5u~jf_}{eqpOLOmK@y_PY!U}WE7G2&y4aL-IC{EM|9KjY)^_l@ z$Mo!$Sgm2Pq05o0;|RllG2Rk;^qQkmmVK-jr}oHrxp)%Fx7w7vT@h!-*JHiHBPg0* zqe7LV8)i0_tlhXoyqDX3eWj=19(1*#t!tDZCK8xyH0a|ixqAfreoy2`EQ>JjXdxyo z{N&C;ET67vLN=%Pu*-$}bbYeKDDeL1OSe#Te-b?fblvH6KhS7%5ERH-?*eni6{eKO zqhjhsiTm;Op~T7;{;`W$J!#w+t=USGLosFusr~M)IXO;)qjPQx&gQ6%SqzeOQ_pF{ zqe|6NUt?2U+bBMM{aP;0SF>8-3bQK1IJp<-aq8r;{wkkfXp{~JnPAo+;X!~iQu_D%elT(mgXRNHk=L6Kq z$){JO$qFebY1gq2Eh4)kFMlWzpNpk(%ruY^?{+TG68K8ihNX8~B>yYt~3KhMaxwwfBgl6^R-{N#hkLBTrT1z%F}pL_reK z?0mcS%8p;+L>h_|B79w7*fZ}3#aX+AwDS+%)n?lDL|_o&?k+=I8MeI(W;bk(HMqxf z-7se*_wbE#<$G_tuOnx(??yeEY=Cj`YrXN|`bwkNuOY?1#Pf9W+dVK%Fu1uIGSu*h zKada()^mJ;RPtl#CUOrL z(WZ&$(UCJ|^??+~l_BxlmKd2_Mm@M))!M_CR6RrcH0kdAEqZ#|s z;|b2!qwbZ>H?AunT9Y?^fg8vEo&%Y)Ejw^WVUO;l{68R^o^x>SSIqL+nC>1!s^^ql+BfMTsRKcyrqS$Go9Wzh`@F zu)*UUFWWBHruoTA3UhE#VCAc66a1MC7tS+dj}mE6e39NnwGG*YX0FL*D`e%EqVL^@RdK5fhVP~f-Qq`~HTV=MfCksR&rx8QaDYFRg-zz0W zR>C))UNUgL=M?KQVR}I-b}TmLJ>E4=UagSlJ-t283|9#^OfWKfuJ^`I`)(PjR6UJD z(+Ye#!ukZ+)XKQA(Q#ED+cKvSx!2YeaL)G6?6VRtltFM;udB0kn71b?o5Ta{1t+b% zqo-8@+RIS6j{9Fu`{8zreCfc&u-~QLn8ur3UnJ*p_qqs|a}1BpeO_cI-&`#>T=yG+ z10Qhcdg#R*Zra6hTX8R?2MpoMj%e&DWtutXKu*t5hw&-HFE@{QCwCJUPmS*_%j6Db zrt2J*-fk4TAJ)6)owt5^FhjG=Y5FnFQ}E?0>yhaD+16ahO_g~+yv;`6gYUp|u}eL8 zrtY~|yTt3If279{MB6HP76p4ueq(J7pKTyWga*v#(v&9lfwTCmaL2_FpawL%wHQC& zos19qzE=VOVJB|O(Yc0$7P|G}6qdIqD;+8vAjES;5IsBp3pDQGoAp$X{Dk7{CqSs> z5vi}7Zf7#PFfVV2C}~s?Z@T_UQuAyzo%c2^yXbWE%JufBQ8Ln2m3$gM_TugS_<3y{>u*IuT0=G`4-W-59NtwShDbdmvg`@DBL z_eJAS$t*(V;I-1Ce$}{i=SYuNAClH0|lRNE*01bH*mqaeggay3E-gb5x<7Y>q+yMQ)68 zC!&?RP5h(p-j=t}mKkvL0mVcv>8Nq(D=Y8E%zV8-+Vc%Q#yR#=pTzLHVAP2TtYn|2^= z)l^J$K{>IVs8Bs4FsG_QDB-YHy~B0(F#5|WiJOfUNwj#bK!rzwvndhf&0@E1x`%Ua zih!`r@Ko_;SGe~8EOVRc2N;gftG zytL2y&KB(JVX?i~0aj)r$e>Eo^=lQXna`K)`LZYt!cX!Pw*#YH}eF-P?rUaK4xB{x4BPh1rQ|L*)M7+vJu5=`%kq1SA0_PAqy zn8w@ZZs%dLO7J@&?ugCA*}T@imkB;!pUTk2m#%j}mjRz%yayU(BWEN_J5qsD&Bak| zc^PJT6xKS9e>e5-#Xrw;)$9FJ2OCVxGe*7;fa=>Rl(} zxm_=4LKbH}Q>{JHN_?Q;K^EE!&Q=z)D?`4#w-o&o#o15g?*s+y9%nPGdMMtNRPU;~ z(!4oPt&<@Wyspt1gT@}r=dO2A5XtK?Qn`W}k=sbUqZTv6gXm*(0#AD7oPRI}Yk|ZF zCg|drOwi38EPkHfirsiF6!%xE)Q1#_ap+C6WuQ4-CsP4k&_#{yf2A;i#i^P>YanW{ z|M(gZ&kpO1VGcl_?Xqo9v*~sI$RTs{N514YdXrgn281!yOqsM!U^2MTL(NCP6>(N? zb0r>s&4|Ah@X)?_&>O%6Zv}G!7pz;1tM~2oLKXX2X5Ov73(j!%`{i7h1%D zS?NVrcL7k+-SA_zXEm>n*?18;<6)EF8~wP+3Y~TyGz`sr3W4G%qxicQs@uanG2XCk zzl&A_*Q@FFhYe*Jc`Z&Py1V*bJJa%|VpGXP__y}OnchRYWI;=`2Aww7@tYe8&05gT zIG5$A=fc+mVM3Dg>Cp}v&uoj~ROc{0+WE0gsc++R5yiga+C9%jBM$=B7TqJ~Vx1N) z50x14PB_~|XnS>kS9_C%i3dY8$1HjsDAP(+-E2LM`G6rVMm-W*^hGeDiLUK@r8cnUno+2X7q7M0_KzClYBo}ay z!s5?MtOj8_G}|jf{pF_6&I41HcNxE-I!}%UfOV*T1Pk1$({$9Z0h!$R+r%$i?t}Xl z1AbKo@YgOhyK0yJYJK;va&ErEi2$*HBc0{`k~7|ufnL`$|6ORX|A{pGmKiO)NUUme zpL9M%%JF2^&n4gwyzp1lMrTiAeRfUYZTcv7T4CK$h1xfeIfku8tI+~`DHE-3hYMqc zR`u3w2INe>;1&D>rn4@1eD4nr`}{-`0Yby z2$Xpq`D#W+H|=~fi`w&}Me^$&48Jxf`Zl#f3xjNPm%Cqb zmYt7;Ap7yY+L=7p!B%K=8J(jVYMq34SE)=5#&NL$5N)xOmh7s_x>NTYPhPDk(PQUH z+w7S6gBjihf6{(RiB!x?AGY>$&F9=M=@SNwFpT)4tOO zZn+gjX>(a?qpph$89&}q{&9U2kMVKkH5%6)Dvw>f^_}JPu+tmd$N=$NuPl1|?7Xp) zaXO3j{C#jpgsTP#Bp12v2vs7Xyw~!Bave-Mhw7`;jEKkVpO$x$=KHfuv;zYS&zqL7 zT9uSbDHI3ekbCGp#Lf)WBlwLcR)lzVD^5EI8=;3$6F64)xLSMr?8og*(+1hu3dp4SD91d^W(`z>gKNUq! 
zwC5_Y?eFoA?*YY=m~NPYhw9U)&(4ffwxb;C^tnMG#xT{K{fpzfBL;?Gf`zE-I*W&0 ztimI}1+(hL{r17!W%I>^*Fe2c3#Yz_4rI&#rSh1GRymqwn74SV&wyOVlQTRdTd?# zBWAse<@*W!a`j4`uf&)5F9;;=@1L+SOY{W&d=;uoZEn8cgfG2a9e;7m5l-%YQgSGH zG8yblGY9N&CU z^!|sSY%Vs^Vjh~BubOoBbpENA; zoix!}w;y4Q{gFqj@gE^sV2?{9kmNisJUQ27q7OSg-NBE@3vRn=f9!QJYF%zd(J5x?NRts;dm=FW2yUK( z0v;(x1drC|Y(&?IZ6_-~8aa^r&{MOWSeQO%J3KdTn_1A8JVsM`mpc^m^kCkZft;?O zbf%O=p!dNILGi)uXg+O-FMHFINfOgCC57cSIgkbtW(GP~i#=7lCg#EZPS9<*f0HjKX24>z& zcS1Wg+T+*a@(L=dgCrD+8q>P`pQken(H!y|qoZ~OzQ``+J~8z_k9Y%{Gx0mMunKte z5#b}Pw73r@DkMwPh$wq|ft&7pv=;1E@#!42L~dc~y%zG5|Mt+V6K=+Vpb}9mZp4}1 z7buwg)Xej6C#a#6B_HCn*+Z!Wk;Ln<4!ofhGup*Vt6GIZJ1~R^1EcuJ1*PeR)$2Zs zXZCp%&mwnC*c6mvJ9>yc!c92-%-+)#2exBp5y)K_ikIn>dKA=Nb<`dyD{gO}U1T;N zRi}e)?(fp^db>=YmuSGEG^%t!#Z{tCKZ&(|6`5#s?iT+Gy9gt(OXsmXT9yCE&)3UQ zEpo5ewbO%()b&KNB_C~YNysPV@rSgy=2f|H^?m~hS$XZIJ<~tp~;z>zL&~T}6yK2?9>j%l* z4W55JLv|%1V)Yjw8~Ks){#8nJwaR?$TL$E6@O=SRPEBh_Tew?Zd1K26)8EJ2kf49X zqc~~xx6(i5n~bC5fBug6rhw-3_gnuIrr-ax`LBzS|LWC07fk7%cSZiE&j0tS*vwQK zjAS<2Mz@jCp~zk-^#zu8Mse-=B^eiA7@-EEuOWx?Uqz)5MfjgV{`-r_=AVK7=SrUp zMG(<{uJ9!OpRe*_PU1O=D%MG9bw-Kl)v)zPcfwf$#;QYcy8$^ zM)zGb_5?r{9l%e6FlgIX)BCZ6EkTk51nYqItDP*s4_ivB{Z(v|TpA%Q(NA-m21`f! zZ$>5JLyqoO11cIG{nmol93-_W^`A=l=&}NqTiq}ITtt&{p>%QM09xJl4tsxRFwY%K z!rGT<;n)6X)38Av?YQWg1J}HhX7_^`6mI*wSwOqUaq0%U>aWGx6>qW6Ms5Uv4-h30o|`%_esY=oSF-r^H(;=nOm*xBYF%Ad)Ta{Y9O z5ZnMD7vuLVh^IgvEh?0aGgkt9-HfScB$()pzp@yLh@Wt=-mR(Sckl%{O~ym&(KOzMJU4CNcIcM|w1 zo5_sh9H48cL>^^+wnDvKKhcqLrUoO=qGAT1QCX@WY$W2c^%{UDc}@lpu$=*{eaD~$ zRH&#V9n>H=_n^nFO0N-htx;Hd7aKmyty}NYyf^(CNgL}?ev?-= zKevWqGv$qZ=kF4oI=6>mJ5hcIvo&~t=9Tk6SO6!dpx_|nqr@XlkUWjMgViuIEn09o z5houXQ4s867wy;S209=#EDZH@kTwv&?b1gC3^4jFCcaM&IHnQA2@-KX@cOZ25cS{F z4B5@7L^i_X)H<`ELuQU>QqI@abWc-C%*zYJOroTJyE#fj|dc}=~RD|h`8!(v&Y+8B7ood&l12ApPw**m{o%tQY$;Os8E+A zL0Qz?AV+%TA^WU1wbye+%&Je$4XL)peGN73ELfGw%AiA_L^W}wmLq~jZ2O*!bObi& zR!MotXxdTkevdwQ_n%k*_bRp?@VhL*9b!QMBZKh$2r|=vvB30Y@@MOHagjpwv#|Wy_G%9 zFN8?m)(-o5V?=}#V8EgyV?FcH?U2k0x24G$=-M@JCkR096h+Z)H)jEiLvDEiIU*uf zKLgGq2T?GwCaQM9haRGU#b%(@xZ4pj$K5{KuTp2Cm!ekROvr!|fg7QGrVg9vTDZbi zPx9$j+xhOB=K*35zd|JDSH|9=qn#SgP@)Bspa_ZDUVog0%^yUKpkQLV&)guN8?P&S zHK*Zw+7xDs!*VN%(_u>kjl|_?0YD}5Hc@W`Im(+es~?VD{Ev#KdHbYgv0?|FLPn;YtT(kUXs`v~@Fc`t|#5cZs`wG4ViAMscgunirQHUmB^Mw&C6n z&bvfi_s4;Np*S)uJWo0Ghr2~bK>Ni^RuJ9G2ZOvBS1gC>h-OEK>rQ}+Z2NPnUy!+B zjck4JqM?6(2A@!kg0Pk(KGEiA@YZL<{q^4Su82x~7li*Z483PkEtbFI5Z*@&OUrck zUjRq*@eVZ_UAP2nW!Va9XmLuML8}8j(S8*w5`pY$Y-T(>j+eHRk5XS$XD0^2!T3QW z9J+w_9lDKF2aTWS34>tR@X@>Mi`W{Z38!NzDbYWgODvEFAaOkbASUp}zB!GQf28Wa zb&mOh*&pLr%-1Y_)G#I>3u>OZ@`z8p%kKMKyXX=%5GQow1vpu)y_9sDXoF_IO1RXBpPP_hFeOd}86+sX4p1sJP*CeV*f?_N7m;})_PcC;a7#4)`}5Cg)9@iMjqfD^ zz=6zn7evZwR|e*@j;V4ps#gtax`I8u34tE$rL-r zk`RBncQlFu>nAlrS&{GZKif_RTpmp?wtY#(uU=YEP8U7FlYX1?Rtez^hxtkF>N@_c zw*WSMELI`^IBb&^ar$88$YBkpTNC3yFDvn4wN0n~XO|k?^1m{zw{B z1`+{HS-mst|E4v7a{6J$C(}ZNSwf#!I#k6gX}81mTkwM} zfA9mU50P8Kh9_W2m$L z^*!#%-x!8n&%dx@&_WW`z_j?`a=zweXVL^}6w=cBUIX8HOV=#HWngsP0o_CWh!5a& z-U@{F3(;!p!cK}H!QAzKc^$%8ot>7`=5h1$TEAhbO1r6#d6CVbTEF&U=MhEfA4JeV zx7Ux%^q>ZK|6Lkp%Jx>V{N($06Dbez*oYxXs}#v0F2d^1spT3~>W$XeTVHFrch9`< zId%_*;>GUB0F;6bsB1f~LQt${_^qlHv^W*Dph8+(dZwnRfGj~#S5=>^kJT+>(9HfR zccZJNh5`U4*3UrepOy_MwSHeO?o6O3uE9x^C^yJl=BQ32w5RJf306tFBOUM`)z&9#LeJ+%J+U6^ z3Mzdf-Ij=!$i=Ak@+yS0&GY1J?^>VWRq)1NB=Pbu-bQ)M0fNqlf2KZM z#RJHYk?P;4@+w@}B`-E>0K%$3t1Gt0z+pzuYDS$$DP%&uC@=$KL1&xgb5B{*nM$1o z1w>~CG&b}C*`K&y?h2mm?bXpS*AhJeeD&K3#$b8ATdy}~K z{my}N_NBgzdFXj(WcJ_>l$I}a@1IeYZ5;1LZDKW4@Q}LRUclYA{aj@`i)xlr%)GVj zqNxtPPI2Bbo50%xqZf{vSJanehtlVdjPm48_2--anvXp`HPHi^YxYuTJ5RdX%{x`b 
zz_as_0+c_$CcKwc{WjeJ#qV699{WCo4-?@Cw0e+JtJql!pXYGx`VGJd@p!Y{O05;T ze18%vw61p>zup^0;z_WSq$RpG4qitsqkZsS(FZxy$X!u9t=YoRCvaR1xDijFJ`;43 z3T&1Di7mr`l|Xu*B*+6vkh;*50rv1tmz_~^fW+ezIr-ynWv&-yHZH8tJHrYOdw`&$ z%L<-Gh34~?TEwMQaLUOv@x0q+Y>3Ol0jE~sqHT>x4VQe3TduE}oM=tWLSyWOB5>^b zz!9ooq{Z1Bpffv`yFWXdabbVww#z;R8Owd%HTE`2=%n0Mb4IPfvLlDz85LkLlICm+ zZhPRP(fIcYU#3s(53vRTfe@H(RlmU~jlZAT`|#wrb1@_2tROxcOyStE3|uT@+ts^q zSO#eSn9H~@K`wozeh%4~G0j#QR!0cT+Sv;&t@)vf?U`9Qq_n$a{BW3l7F1j71aP2o z^RHfnc@39HaJ7+-MgiPqJ7LOC^z)iA;KQlPRtrCB{E@6Y^}@ZIZ@6|K<{z9Vyv?*pyR?P z-`#fiO6H|(kD}l#?4%~qslgIRzqUVBg(fx0^zK<>j^(=?P=!&A1OR!;7;2Oxs>Agm z5Q*jnfmt@GtUNRvp6UM$=n64~6Y22PM>0hFk-&;=H!J$!ruwSToMkW3af7m&iWX18 zD=;o_yxF7|9Uv!m#Sh?uToM9~78?cHBKq@XFDwBnpyQZQ@Yw)m&><$JCq0F3qh0Qa zhi;SgaRFggZ{<=0oCLUo2tsCh1=$ISEs>XrIOX{Qe=D`_8STd4w4a4h ziY*2u&Ypzu6?dm%@6Yk<+T~Wys~P}Tk+xFJavL=@n?lG#4%L3O^LbE_ayG%W`~D1C zp3C+yAT(T*e#oF9IvQO0F^9lLPV?({`!+d<^JZ3|g$r`KRI->QafA(W7}ql!jJf^{ zZu`>EjQ=0V&q9YZK(4_elqQOav+yf7NR8pTAD#B`8>={$S-TR>D8CC1qv>d=SU>D0 zB-yztR1;C6siVeepw`m~8vx!LZjmoQ0=BcL-s5qv>2g1I5T4PN z*95hmy4!KVD-;Elo zSQ`I}>i9p;DE=?1^J`06{;~t}t5?CdM9jjjJ2-+_W1}vS5eh*K+UzoNa_T&+QGfne zvy-e!5UE2LgG+0#EXNia|1M54b;JnHP^Pu8EJYBE319(0dPYV!G^b5CC)>`$8%dHY z%pG|oB_}JYsxnDP{C!(O(Nw%+8O_H*(?D%K;UAzzmAY*st>&%#cT0!v%XjC8X zu1d*>zfOPVh&qpqc+&9{n-nGZhT)e(bF$kx0|p+|nu^jhQIEsI4+V0D@;zm%gzx`p zDlw_3tD-S9^hwfq_A4>5?3Ot}jkvg(cQFwWUSpf#^3m~ z?Kd+dWPEb`YasgcpW=4)gM?9}Lhi0GpFbDqkU|0|eQ9aw(oYQ}fHN%S_LbZ_4lMM} zYN5Y^YFdqou?EcsqDl3azMjOUr=#mG(+AfB3JVLpIykQM2-*E~{-s`1i||kRf01E+)o%d!_f6C+Et!x2uHUjDqu#$_T1iC( z=hdrjZ(o~hIEO;*NgzSscM8_O-ISQ9l&YErGaTyz)&mN8Z3(!gvbRK{ZTaSMx3LN|chEvo~5b;55 zLqCwYo*u-!e&w06M&7o4`$lHy?`{`d!$KqCmav=a8+;r31duH?Hu+eu_iFL+zq|ApWyA{f&w8Qov$;#J%q0X3?aItI zZh*YlWVjJDFwI>3><&-oR-IHjx~;PRRm~$(K9S`O7Xw4z(UZqrxi%f?N+bl!(QW*k z@rj=xG4DekniFo^JK6yYE>8La4aotnM=q@vK6Uf^EXjc7fyhZg%~AcnPK2$BzpSv; z#q49UbIcV)#cZMSYvoF3IDKRpzGkM&gh1(RDY-LstF!FMF@N-oQ+`Cs3tUx)yBLD_ z&i*eUxVmE3Xp>vRh#RPy(dqL|MZ$)5Ch5} zF$&*)SqY%!eBV|D_5d|#d!{D?ae8RsOT2IQ4g}06qW+7hB_zON2qppVuywZ@-CZ4_ zb%o$bqIMNV5&=rp83(GikL$gRrs#~T)Jix7B5G5L&KQr%*6~7=C6a%}0g1d3S?&xX z=i2>vH}Q|RriN|Xx(+_qZ##|ZQ~l3=7B1QqmSk_;cG_2F-#@2~@eZY=g(vxUj_tD3 zsVkH{+>Zp3xeNwo1agP8XKrCz)b558;IzZYCGOmxi^k{BaquOlgg)A3A{fn%TZ6wR z3%L&TCGA3E;eS1VGBSQWl7?>t@>>r3Gb$-8==sH76ts+dOyO-lU)I}%{dnlTJXj5T z27X=*TZGr0m@Rx%JO4S5EL{2FN6U_&!yb`*$Z(NUA(M;w&8nSlTMP_)M!ooEd+Ida z#FPNbys3aa;D(0=)Sk;wfSNkYOaR(ik>^9eLzxZXv5(~F#Ac2=XBZe0U_&5pyq)Fs zpyL_*(4f-m9?Q`FL4eDRMa#{;Mf5RR;g$ED%-qMu&*;|&POJV1v;=YN!yx!93O+@- zwhmIO7#+-1Q33|z2GLG`^~R%e+ib`?aB%50#pneNN}{g;YVk>E{OEa_45GE`SJAhz zbteO_Y~_71L%rv{TqTy|oH^OV8$ejhLKi&j>_P;R zczj*MB`jmB)sslcirdS&GCCX)})|jauPUktivec-1 zZn|}H4?E7;Tl(!x_B_h^<>=$zaqkny4G%%$Pyii0Jx>fFEyWrsI|;Ul3G|29-rk;- zRT(h>VnZIy3D%&C>hx_OakOb3doB@SE*r3Wxf)nEiyqpvws1hPP3AZ9>IaU+Wu8`_ zl|&|YaAY!5M*UG+IG)wqukXcnZC>+0kw8>hxHp;JSK(v-3l18)tiHaob^7)$EV7~)$LaJMDW{$z^2Ecc)gRgGZNWf0vOf^Q}pSqeD|#gTaqa) z7lY&03|f)PYC5)$ACUa9|NZiflt8owzs5Ps7q*p-i?L0OArmIrXdmz$S$k11Y{M+f3`5eqMty-~2n zG^aA1Pbd0zxk)E@Br!l1TTLbU+q#Asi^Zd+8rffo4oWVLeJ5}Immz&6(p8Hk=LgAEm?*#F z9n6o%A-s;T&w}vAc(X{WAxsYWX_0q(bH3bf9l^6?V6CYY!$rkevNlO4yU3i9Xk?f+?5P& z7ban=HSUB5fe+bS0hxot5ImQSm^XXhqOE7X3kf05?bO#~22uOrM)y0xIp{+Z@W(+> zUvBd$il z_E4Hert|S*e|DYvEO}l2*Iw!vT5V00sBf9Z*I_M0l@I&$On#7rSe-|s?dZ(OqE-B` zWj_bRt_gy!9El+uhDS#k#Km(82D6hR3QJ0eNJ!)qA~+MUN4RmUF7>Qm^Q0DZb4SF) z^dUi1Vq$RbcZO;@ID@fnFXp|Z3I$wvQ)|FXbP<=)#rWQ|bEPvhgD?DaC!Btdz!h#E zn^S;|7!jwJuo_XcdR=2emOwrZISAZceI9r^>4ZoqF*PaX}e`IO1M5Vml{hS*xBt*UtABH0)HR2={uPpdJw17 zIqk)%_VW-kL!TNKrx**!4n#(=rYD2UFG2F-^N@tjK+5PFN3*?b$ri^Ra~iu(ryr|_ 
z7Tn7|8+7H4a*P_erBI@4INu9=9KSAz@07n&Ef)6>;u@EfDMst+;tm>3rkZbzq;C_f z^MVTYSO)a1m7|bFvY=Sp)|vn$C{-|{-aJtz_K_4k&o&ZvI`Ki~l@9bsu*R^xV&ad3 z7Wmf+=IInT#VoTNE?f(-@OO2v69$PVc8JcL!%|Af+D@(G;3sR>{iAc8p*juFBZS{P z1WDIXUi?71`wOnvv^=2by5FUZJ4MAkkHwGc%2o#j}oW|?}CDO+-cC@&vhWPJSY zP$K=q>1I3`8Gq}oX}L!1chi0)icmHNB#7BlFDlgs^UmtK6KhtazQw?G+4{qCeX^?N zzGdxa@d?LyOz>XTl7{YrQBvNAe&42#0t3ck-7U-*ZA% zAyBUZrVd))M3cvb-!GZI#B7S{z$Y=<@4CgV?GQ=Ka)y0E$qvO(pqUf7Xz}UL=Er## zrqKQ6ll8K2jEN2t{#QRS=D2+cxgggZi&}BJ>(0+_O)iTxhK`E+61&$ePR{>~OV-uc zVWThceKV|UWXVbly`tL%P!Wqb%QiXQ;%lD~ptcfeR|kw%XM zj^nLicz5!X=kT)8PRZn}8l$m<_#0FH*5r6dOLQyE4!Y9(Ey|79HM!nLoMC~z@kczI z_470V7RPrxr`JV(H|xHBO`q}jjJ$%>CydV2Z=Qi=dMs*pa>BLKsNUR*%Dav?=2+`@ zP^Epun7_RrjPbo^h65WTvCJTatLbmcjHJllW?mH-EtmPVY%*XEE{{ilM@D>?qo+7; zhK-*(N$$**qWeAIq&z z-#vuXpcC;&yvG*}$D`@S)iy0p@+2aVYuumEo>LwO_NQuEe?!xvG7}g0ZM$sykI#4o z4y0cExhY*;|B3NW&y$Ng<`H21qjE0Uhb-j&?D*Che9#qFx7?MS41=RbTTWjg17Q-O zqLNBa*GnFj$kQPOR*eYO;|&-`FZb(@#W>p|=_UGLm(bAA_jRU>zxjB9UZ+ydq-ru^ zFJ2d{f+(&W8OUVZYAJA=!{Z+L)4!k|j zT`wz>MnvQotNkB+tHn=dzZ36IUIf=s>39-zRVi3rCAk+~LQp~G%C#qE1C*dzj;GT5 z&v%y9Z_Wsz%)Ko-AIwj@jBOEW-U`HV(4oMVsG;09h2vPuETO|1hnUn z#eECUeTxOBv4WEc?hRaYNOR)H3dx;K3p8(Vg>M-ugJ-!={|R#50+#&8{Q*n9H_Lz} z_gU(GO(vsE_|+^hpcBciMU^qot*&v*_jHoX)3M0G8vh9M(^!E#5=#G%B_0 zzA66!Dap#Y*u*QBsWXP1Z;v3?>`JSh{=#mz_=sb8`qb2q4bJ>k*0Mw`*?0G^Jgc)C z3gpC}c10wrp`9)qI08y>-}QIZRW!q!twRJ>~GN&^YlFPfuxF8@i2_B%ygZ{zGzR0ms|K{5Zt%DTzcF z2}gxl_nZ9*dWKtTb;`xu)ms|_R$Tg|YR{FogD&9EZ(nD3cQ9%Sil>9^Z`4M$Q+=h? z3fz!cFn%Xse|)A}<__ED$3wBZhk1g0BNLSx+rxi?AU%l2Y8lLo>^YI>Gx{*6d5cFd zBKBn7No+E?W`AoS;PSzj40-yuF`&%XqgL(13ZYaO9;JteR5uZYj8l5fa{dGIDepzjea?e9DY7m~ueNwg(5n=8s zOocSn$(fnMk+)u|*}}nNX^~@Tipac+akQeUpFguAk?6424oWXKCKlE!e)gC@!>JLG zWhx2^5qzketWi-PKFG+)5E2mN?p8R^sh>YlctWL3*;J@wIB65snD_QTp;Oo^lW4Ko z`PWROh^E4POXk=fZVYnYl6A8DsE(IEXSGEz1EsGAJUV9q%ruB=<3xechLY2VzJ3 zEmzu?+$QniH{BRasij-UXawWc&JHROuWzwgg3b@O&c7#fuG}Vf#x%-bj7f@mINiDN zy6@|2tl^hF9Ah3r1l%D#;EJ5WXfj&RVCL)(eQ*KPhUkTCY9c9i54<~e;2)Gbjzws;e zJuU$#MY?r4cN2Zg6Qd7WhbBMd7F1-%@mXtl_osrwyZxyjUCxvkH?(~NFf=E0lTQ^N zfHm*5t8|27a1V1=>?kZsJ%4hD`67Ruodh6$zzi%S4%6{62<%$Y(}m%PPolN#7Ptp+&$^eq0WOb41-g22f3HoH>>iPp zk$H`7Bh34IC}@efwfl{at_W@yCLaB00fBJH>9=y&Wnkh|xxrKHHs?NC`V|y+@jKik z(rL=XHJ)>pT{gL6T zP#De0!~`by{I8>-;Lv71@q-OZ8I+R7_KC%?+!n>WM|G{bcZZZYJ`H)s_z@j^ocFj9s}49ZzA&6 zpCh)ztI;xeya_>%Enoaa;pY1-U;WMusX05gKgi)6DN+|{5+-NB_6KUezwJgLV%nDY z#%(wEKI}Pl`K-K{>38Y=XWGKC;w*^_qA6pI1x`-9JU{0R)1!y|>p^1Vp(MW$V8a5* zW&}1%Woovh7=@n&RG%@ zyPZwE_NQ#%ppJXo_12ijd zX$cY&k(p{AH!nDmXL^AG4c}dTp0=KOk%^_4KD)Z=|J-q^ zG(A2ZpLKt^emKu(hrRg5vW<1upz70)8&Xz7i?jxfnH!e-hQsMu-WzYVno+r{cnRK4 zg?Q4!FAQrczb7Zb+6PDxGtVj^&D8byt&R2l6|dO_qkfA(~EPt14>!?mh7m+t*Yj{^>0Lmq1rxH=!w+~wnH z??MJK7w?|9S|OrINVBA*3u7qxm|P?DSIm39@f47Y>Jp2kL#3-)1?rp#<@?BcsNXZM zKG8?FQj_28>T0m0=B!FBRn*m8kc9>^A)^L$gv_8IMB&z-7J@RdsH9M@`y5Z}m3 zuUjs~y>ION#CV+RgwM&JZF0ZjNkDG z{SxW?Gq2R#V&nK8%GuGf^GvFN8-R6e(ViY7(9gQ7x{I9dh@5kI%;YxL_qs2Ud~Sdy;kt_srp-js{*0tBIb0}E z9{A@yJ!stI{*}jxa&Mn>4#~UIjzarybI@$o+Xt&+r$Ys=9T-=kPxsr?B5k)N;ngnq zW6&42HTX(%4Sz2S&wR(RW)1wQuo3M4V(qP?s@kIeQ4jvl@=ypQKLuCqIvKq4i}DxZJk!b@H>THh#8FfcH%9#*dIn+wkR3H2z( zaoU?Gl?Kck)^>N>lZna61nuqZ**Q6@ySqrak*sD}tV%Hum24(f{)Fdk1qykv30Dt| zFkDEdyQ~8BJL}I2n6$es?mDko3%ps%CJxQ#B3*2-CVrC7x=<}o5q(`X^0uHXno4Q- z`h3yQuFGyq%66+;wyNkXovH(4X!(xxe75CVckj!oqxy~Org@E_SYm`3$0|#G#cyEQ z`HJx6H9{SvY>T@dKh$peS62yQKM%cEMgGdb{Zz3eN?Cz0e){mp@~eg*!Gv(escA*c zR-`{NlI#m+j3$X}v##61~oR-&`2;hErT6i3WIX-Lc$%YKF%kkp+{5Iw`P z@6d95%kDHmyK0k=|dx^Pzraum8Q}Ibq8+8QC&2>oSdJVy76Gc-5Kv zL(6w+L3*}3YvXCW? 
zQIOh;GLk=pi%kux^;+`0;XmA2=+Bxt*0i^X8 z#W~P3D|v(KF{k40e6~2)&Kn3>x-j)h5(9y#4AUOb<#j7fxeh+*J7LUKA8BDohWb)0_}mb-JBS%W_c5)FctMZ@Bb9IG z_^L08$9=e!f}_TmPR@LY#);P=CqQpF@>z!p-Sk8FJGRJKUG6&t#8~Lz?TLq4ONV21 zDknB-A9>rN+;TB-E|DLbxA`5zh`U!TRpj#uDZ?k6No9$3?#cwJ)^HJpTHJlr;TYJu z*MyS8=})f>33eO#*ZtI_U2HaLCLpY0ua-eQIV7wOSK#-42>P!g`p-3J9lmIa1Uldv z>wB)Hax}_APWeBa$&lUJ(@k+Zy!bWaGWa6%5eu#A>%IXyk$Ues#Q5Z-^*BcsVB}w- zxtbv=E$KJJ>6VfRX3_|}Z2qXj!>u;q<#h-wt7S$&=?_0tmD&vPW>ffE7hUm)^^0$Z z(6f8q7Bpl~I4=DWo@gt&?^<(2cL=0zI)IfmlK{@uDbFq$X9Df2BRH%}p;46&;=sS8 z-w7zJ+@>q#_^3z?gD4M0!*lK`AN7Ya8$wCUwhO$LhihKJm5Ur0ZBi;E*3#eykOvZg z!ZH@gx0g|Pd8Hqc_|pQ$N!Hs1HNPq^BC1~qvNNRg7uj7!S$=u3!{RSTWarg}dkzH* zQC#AIu9y2tb1f=3MX?zxI_8duOKL(D2kZnt65Y?Q&PX|_5>_(a7T=)c*yo->Ar*j% zysLX{YxR?@EsH-aF>&Hht_3>*fRMU`MX?A7LKctO^q4cc2GAKwd0SgaOV=vlQxbng za`;=(Im33we>;BcSLeW4HSNOe@>W^SJ%WNe!X-2)iaqnoS&qA)sl_N zxJrNR(`6c%$#_C7t=zUPKFf8*6uNu74DEg=19_23JeaI~1vWJytn9fe-a zPS`U6S$Dn;QZrv{44m$vU#sCAc$0Ry1A%_f`c+VU8_y-iTN{h5ZEh$6EBR)su)5(+ zB$t07Y0!10TlWy0lexjnVFJaQslVpJ+!7Jz6pI@tN#$AS zYAiQ%rmvXx`koWcTyaHz^7LGN=8r~yl7>Re1HKvmc*B(pv`E(DdRnh3NO0hIbfSt@j7}TZ5jH0Lalgsgd z6Xp_sf)W_(Od)@CLJe#$uSfZavZewNs0%AfeFzd4hgYP&hv4)d-yxH0-zZJdGz1qm^=OVwm1v{2Ei@!9j|(KR62N$Q zwUY8_DPrrrC|q6gsU_lJ+3+uG*bA9;ue68rA_^g$+!L)qDA}E|;?xIt+M;_=BOSbx z%Xe`6Qr9C6oz83${Q=n%ZCZev0zJI(8)i6t>VFkwZ^dK@bIRDl=>Xs^_vq`WnA!jr^zu-j6 z8qJ=|>k@9Jlf8h1#(DzD*4wCV0V_jr5S2U~KfIEa*uI&Y9jIBTp~cfA7RAkah-^G@ z;VX8l|Dn-iMRZD-#9OY;@J1(WA2ar?{-Vaw$E*1r2a%EI*oNJ%B#g;CM4;xOfy@E5 z8XE$)@U(Xn)Nsj-Xo59+HX?UN7bM&%f)+lsJ?-yFWppyXfQ@f%_ls`qhj}0BTo^%m zV>ai=IS;X#^)541pO$6<2Z&&k*1mS#SG8mwLTBBpo%y_o!{2rT-FI8EGxd1n1NQ3X zZv`SMMf7XF0M!EfZn)l4!57u6M12L^40LpqUQ~!=3xE^4Al5~c>4;ldX4vVM{-Vw3p~PS2;E(Ef051Mv`QRx=eweKXdM02 z`vXnwZmlLC0rHXCk|D-J<2B@(M#`~!tSFk*xOcvKw=RT1dn6T-pDzXzXROrXajmWB(X`~TZ2|MkvH1v-%rWfop7Gp)@6n49W~AdkHrZw^)(@i;hD={eYSE-#Z5O@7J5vZjl7;^ z9~~un!6grU>(r&B=6WtV&SF8ry0_4NFCQh`0sM0_{Y6K?juk39D%4r{8-_+Mivnq9#nh%gpTa|!! 
zq=qS&zx}Y}NpH~z?ZpRhAu@OQauC?fGSa<$tCoh9?M$bo-JfC-q7chO>^?{sA#9tm z9piytgUvbx%}>=+zmx(Ccfe+qdSF zuy0;kN{klKbTE_zBFp#*sqV1lL`ur*kF~Rct2JE8q@10hLNL`PscHp47L(^dKu^`S z&u0JpvYK#z`kH6j*1bF9aXL7@u(I`GAm`=3At99V&wqQX#RSqPb2M;&XA!}Mz1feO z#A3Ju+^jdH4Ikah4$2;Fsh`3z<(J;Hqoh`)W|oEv{kHH23nOu2MB09AE*54;4JUZ| z5NhNJxTjFw-&*j%fx~eB zT-o1XH1G6_`}<3J2L7eVcY$JDd4GG?#k^6<2Eucp(XRhS>2k_tVp}U0VK**@FIoaR zs<=;5q@+<1!3zWZmUVDC6|Wy#ehjNeCI8dXm-jte`|EIjNtqXq`vQd9Pfg8*2g4`} z)+>|TrZ1qAIII6XHrq5GU~yl+Mri{IXO+*I%bFoRgC1gv~mT3Wik zxn0}eRylCkZ6a^j{oTyS%8U_sLhs~c9w1haoPj-jXXa~Ew1O=@5b_bI0a$;1;^QfS zTUAkziHVP{Z*Svze>p5g1@|X{!k!!!Ib|a;t$*D3F;Z~`W~{t5?s=L*r1ndI9v0}E z%8%z$zk{%UUmA%)fNS-O`^E3>c{&{|f&H&5qfGXHTp+Q2lg@7^9nv*niT{ii@VY;T zf&KsP7WtlH&BnMIv*};lDAe!%w|8$SPdt|%2e?d9{>~eDd3kG2!r@R!;Vwg9Wd%MN zUa3SvOzZ_t$dS??*Ua8cTKQ+gfG^h0h;CF=bP7>KVBi23A?n{Qecaqh6w>(XM@OrB z<$owSI|ZNt<0J}1G$K+iE<8qAS$qyl;m?zWswGonfwIE= zr$|UC{0_k}6N;$Byu#;OLrO@02Qv- z)p_V-)--R(|U1!G0a9$2bDka)1i6i-9_FS(d54V1z+E z^_=*BhJ(87rF0uPP}B%rv8M2eu&K=o6;Bc0vawBeNB()3mF1# zWLX9NjBf9yyr|7T!-_#kV~+zkmQyE{Tg-b0bI=wm%FNzX0nZQ<$voa&lPLV}^nPMq zfnwiW9gny6(4V?3*brE_N>K%c|3e$Y|Nqg(|B0M7w)S9zg*VQixBH)9v{IqmFp-fM zxV$Y616%SD)OF#hzc?N4OFcdQ5TMBaV2L!FFFB)o%RtthA+3=%?Z>ZBiU_n?VBx3p zMaPS{M@pvuLU1&biv5}W;;`aC4JaydU;jH){d=ox2ulqisVg%Y(+##w8OEa1r(L2}iF zc?k;O&^i|JyKICeJQ2q_M-U%{CRpBG)|TuCGPs>KK|P!u+CE!M&MjBQw1 zm?j$-?u-Ma7Q;J8Rs;-_>mVG=0{gWx4B%QCT%&dA`8&Bl&7Wb3wvD@+A_EPVA>Kv%{gfb*9>D!tlv70<=do>Nn)5U1pFeeLDU_eRr zMEa3U_I&b{{gSe^W6DNV%3AI3)UcR1%jeH|zxI^hHckrj6i*QG#tGK+9}J*Jk~%*< zGaPR{=2E#fI`#Ji)44j{Z4u261H6%AzI!u zZNI=6vRTTeS-Y*6-dp=hS{`CoYdV-Zv<77F#v}Y^e?|95Yv3rjd|sHO{rywmg09(lSroXTzS(9A0?|Z-|E)3KL_iqX8F^n~$ETJiW8; z5w*V6d>!4T@JAt$+D=MiaK#|CEs)sD&ikjiG@22CtPKpX#&crQVsGE6+$7Pn#jzsR zRk^mC^Uub7&vBKd;CCaaIe*=jlginThp6gfx{7y&(@E($d?XCIc`ie^-Zs2^y_zfn zWTr%T+{Kr71kDBaJ`Z*qE9#|e^r88A_RvzS#P8>?47W1O!%Dln2SyR2aI)^Im#Fmc;Z))_p+HSkX2x;n&tByXn{hBy+f>PYE$I&!TI(dNQX9z_qb%F@w@V}uE0DlIH7E8!j_wjwJMCENt@1- z((RAwi@!+G?eu3o_eD#2{hRgHCzXo+Gs`y(r#Y~&oL)6=@WHRBLgHvUVx(1hNs$G z__^7J=k|l;fP=-eH{7NSh!D>nNjoc}J~EXbd~YVlJBi@RC-(jVaX;TgUo@GjEPR(| z{*NyJl4Z)<^#gI4`AnCk(PBVWkQ%3R&m8z*bn%r*_e%6;*#)1~=!5>J2uD+tO(l7G zOc+m3Pm$Z^DnJ~9IT_+WhP~ccpUtUUlw{Uj6ZZZ{yV}&;oOm~@0sKiNjy4i}avis6 z?x0zXagY8+>rp0Y_%;rAoNy!_cxED)&S$W5icX60Cql_$&z+rBHVqK8%8EHV zyE1w7gr=m&x({eO`pu_0y@FfWjMswK0n)I-n9r1Bc%vB)(ZxC&4MNeq`J3HfN3w0A zs84Y06jP!ySZ~g_s^H?Pi&`B{~&=GcqEI*El{K8>5(qGeadtFmcZOj=>%E8 zyzzv+-jv8_U~7#VZcn$PJ7ZKX=ItxTxIClYl-vSa+?DIDEY)*`^&inA~ae z;=!0&)5t_?W6F3eKP?CwEf{3IVzP*z&!idD@T4WV0Klnmb`N#CMgN;)O8+R%`N?&|4gRKM55|(Zf6t{f6G3Q ze6VB*5Wg(%%?B7*SXdZ$USs8gE}NAVK^P!a4RV_kaw|jZVchox@%piYyWeSg43jaf zY#TJQiHm(Rh1EH9!$kkbx^VplF>GW*8p{yfHCez%!+v%!cAn0^}nQT zyG`s5gdFoph^AwrY6)i(_8#5DY2L)e#l37h#iE3L@v3BN;;SR z5Tn3ma=NGSc7@nj@x%8ivp%-w;}vbl#m|CP)8AwFG=5zU5?rvx=-d}%j;QqQY|17Q z%K=(h$3o+C=?c^Dh7rlLgrdq|#+uL&D|eNhNthF~Ap3`4Nb?P_r%E|kAxia=s z#1pXs7Sn;2YQvTV0K;jc(43>TZgul0! 
zx&H&8{`boN0VjcXMd~S_{r%KPpMw8hsrUp~NuPgd^zJ|4f%8I&!-FfDL=t1 zKvH*XBm#61_{>3KTndqHZL^1tZ_lgBnTY1uX>CbD>I&sgk?L zJ>b;*H$tSwpadWQ5U|Am?wfA_+LJeHT^)y3npP>Q(FnSYD=T!5&L+BP&4wl$QDflD9>s2BVC zsJX)WM1*PZ|8J=9AAr?s2;d`B7TVQ9gy(W>RK1C(e;x+x6{-BL{uyL|2nPNE|N)VHRc z^AhHXOic*Qrb``mfkdoNSi9R`q(n$btJp$k!=>v(yOGn6AC{PD=K59_TvMt2I>y1f zQDT1!(i(zD^7J9bk zv)nb2;DO;t*(3d#VviYy>*jCWd*AY`U?$BtGSv2E?2jE-IjT!@k5%PgzMNxx?E7NI zS=H(h%KqyNGjH^PN}hP9oEDp+Vn6OovKr2NoHqXK4$fkCYkjR;Sfj1{OT&?bmp`cM zRJ^Gu3nQ|_Ams}vpscB{$pRx~K%vQ0GKdl?#=9!&SmiP)Uz_XRc~_^R;;UwZ->C=a zYLgUf4$F;y#G$v3et!{uwAj&aWu#DYpUTT+LdP+6X=_sa#4H-9VO8)6SBt+u3B=>@ zKNO@7u9BIlHC^fcxH=4hu7~bgc#J^i&I!Sq} zXTGA5K^}uzxM$Vs2Ha^?#oUjo?1c}h597IW$P+rB`+n`FCmXan(Y|Q@=mFPyP&nNC zAkh+Jr?$7;2|Di?A4|F?;4xt~=DcuvL*Vsg(5dJO9pefc96Hx2bS%5sLffQ?SE`6P zg+O+F9fH{KvyGM(uBavhBrFkkB^p^NiBvDIBXA2R`5*zpoC^#1M1Hd7rBdm#OEbK{ z8}sFteB?{ehN3+}tMC?0ELT)dPc=*~do2(MRJLc>?gMCP43+mBjbxJgd#HH)m$4Tv z$C9CqW^4@C!$<3qe~D>+acoFz)WptHv{i2M*|XlPai~9=`-Vcql8#FuvppSn=+1OX zgzv>4WC{Cgky^&+jp{o&evXxYK#K!`;>W zJx&pcAG*3LY=S6sJ2vA6%k2g&!EDH1J_>6LdY+-xrj`4yiU;s>eAcYn&*o)@?k;2z zX%$WVz^q9vUw542P`=LvGV@a#Y9nJXvp{o*w|KR;dRd`M=0T-N`rfRK=#N8ngyqgmfk zUV|t!@O7^z6diQnox?z{t~kmi(NA>jFP*Qi*6Dd?5}sb^i)p zeLN(AD?7=bI7m?k@VmB{?*Po97(t2x8>H-H?;4tRB?q6VGtdcI*YdL&i}&O?aKxpk;=iKikPl&$^()+f6a9LCuhZFItVHhef(fYXhvIsIQ|Jm# zPRI_0t}OMDh_4o4Q1&LSRvWVIcdi?w>}J;Q7)c8NjgWg=!Oj~IHP2@ z!}4-khC>u3<~wRJ0pH1hfG4Hxka&~fIk0+zr{hO{praCU{$*hBUU$A@$KHVwSA?$g zCwxlP`}>|XrPUc5Yuk3!Qq9jJKBCvUyXgxMMK@kaR)CI!Wy&j9x1i6VwI?W3l-H-? zh_Z$FFbJ>qU2VF*89am=6FA8x?OMEt$LJ0^b?ABxcI#rAZXjPvx?60p2jps_wRvDuO zppj&%jT`MDsq|67Zoj6GGmEQasHIAL&SR`XZ!}(irE$D{(I%`x!LUZyNR(zj^eep1 zS*>6l!OMh3Cz1v9rIEOXzI@px>K-wgIXNFKC5Dp1=I(K;8qT^%veQ-KZHO4TUd77R zYES_GtwcAVL1&hZ9sw+|N$vw%Kr;aQ&I`1Nm9}o$D@*r5%!}N56rF#X-VLPGoSgabFUZD=M81 zSLDmKa(w5l+4jYWFA~{Ol~ZPt`)`r&$i`pX8Isv}+!`sB1;@aJR~0{)AZ|;QPaTSZ zlDKC9vB2)Ts4xsVA)&^`Hed$CA|{SJUkKhBOk}0get^5XI(F*2O6D-@0TRNTLag?p z4z)uutz~q0%0qJ+S=$rkg1rP)=v7$rWj$IRFJ7p}LA&}qofK4@XtF=D(-Q%CsgJ7vJVY*%01E-MI~d zr3?~K57>De&!IP6<+tP6)Av&D=2RBGF|XprbOo-SXSuq|t*lRd=}f#o)UO_mjSm#d z$sPYoyfR&d?yctOclBCA=UtJN+fX_AsV)|%dF~Y)xA3E?$p= z!CYRg14~=nt(Wol3u77oTE1fHfh;vZPD`|d`PRpu18i1Pp|Ehb>&MI0F{HE}|81kB zl$62p!#&_)qP$bdhuIjNkf4|y++aQ%Yq$CB-zKctij*V_#A$M=5UXksdH;hy#6rT7ebDFRiyj@>^GnN+PrB>QL}_CTr%SR2H;P~E2_gta47Avnio}}r2dCWq_AC-!tupSi#gB1al1&4{hK1D#R>+5LTL23MS!;1=v%GUMQaoop(D?Yw z1ZpfSP})$*Kl#l4r2LD``2|_NGS=K>Q!1i%(m)yIxF%5nFnIN1KvpY2acp#uFbYs# z>%y>WQoMggu%i8y!eH_R4%(G#m<#~^KM703i*HoAKwlecoiv8hpQ|~qU{HLh`$fl6 zph0M?a<(vn(bVvcP<@H#BVJ!u8kg`vqR@dVgs*Bjt$zj_$KqklR~-P@fMcv49v(he zO%;!9CUKbgx3+rur9GeQMygjtrS}REx1!(v?CNlobi*fGqagD++aS^4Ag$;7Y}WP7 zD8;V?i~=dbJT+b`HgOkBVxt~M9QWpcyoY&-bfh)+t9vODM@oe$kg<|JLA_Fe=zh40 zoIo$K=$hIC>YeYs99{wtD=W)S=z=y>s@l0u|EE31v)C2@xog`u9Q5JPuuK~^j-ggT zhqj&dZv^F~MzFmDu@c3Ly$)SkNZ2FZW%@rMJoz=w2cq+-JR=G|I;F3pq~36DGCJFY zjV)Sj;o>Db8FyN12HOt_oz<^q`#6x<5Sj)ewE3ixX&Qz_O+-~My66}9%qR;e5`vNw zxIOsLq+5w)al7dOl8I+zrjGPLyu$Uk>FP>Jlw}09z}<;UNpvlPA1kL<%^7ayQXQ`m zm)wt}^VHOnQXfL@3rYipv-PUJ*KWMr=XXr6Sr5F9+olu{Zgu*#>LzYxgMPPN z=gxF48CSO4vrI`UK4{&^SWQgN1KBD)m1Vi=amCemYgp|0!@t?sW)8N#Zhh0j>%{zi z(fwfoe(BZ-?(V1bGXneUHzct5rlJI7+ z>wBMr%Wf@%Qx9>z#VVn7^Kr+z1SB%0i&?iWncKu3ay)&!CM<=p#HK#xaN^4gfl|#u z*@KgOw=m6~5l^2?rD?lV+N8lLgG|YdqspDoC=b*GzI?>H?uEmpScYlBXSptcIB$SK zs@sd+Jz@Bx*}iB66wX^(7g1M~(cc|pZKRJ24*Hndt+V=s1S%co!r(W)822AX*XM#> zKS*%2m&Z%InJ!S$#-x=7onn z9_8>Rac7+1!fxwq^^TRN`WZR$Lt_l~V>83HPtVC|C*@s7H-@a#hPISj6I#S0p}6BR zq!(^Kktb7tXKlKCS76(@#!8zafP7O52cyD@oi>28#k_!Y?`zM?6q>A$ 
z90a(W4huNvHEM4~kJp`UT|3PtS{n7t*%R6=;pJ~62L|>U(wKtlFqccy1sJh+&D}**9#HYu&22`IwuPr_wG`3DY36(g(FQYOPnuPMxe>*$>7nyV#An z9|6UYO|f-R4d#dI@NQHc&xStgfbN`)+)40Vgik@+WvmXHHp5(c*N3rpE;W$!CL3uZ zqLkI6aJQ@Oki_1jUD=a9*RL)@0)-+E?f#mPDXg%pv8_f;Jp+A|3AcGXLv9yYGEjDQ zs7stZ7E|=>)v%=d{fhiiQCdw$HIr8JIzRtXtWVH4(Pk~)+d)-J^9_+f?GV5*RH3#f z+vaOK=TS=~8$p^yXZN0)Fb&Oo`18T#L5jJ&E_V zj>SpRz4(_yZ__A`9#7oQhE6g^r6JZT5TJ5d$y(9P_?t&pbgn$>&fsMAqj!GX%H+{j zjYTHAwcpF_C=#fPQ%90{erl~0VBGI6P-T48{>13!b(V8}o_?~hXUSW~g>=5aZxrg~ z_pViT6+a!tQHBVdJkJtfLXf6%Is=N{RZ^Pecv8hGO{7^3%g4F(#onSTMtpKQz3PB< zRz3y)IyYHkI+p>IA>+++>NTFf*-N{v=jpid+?ZUr*+e?*Pqrg1zFPTCeyYy(`8i3 z?&w>Ic)nZ?j=PD*u*MRyMD1u>`Sq^u^LFPhVU_Rt0p?bTXEHzB`4YLkwO%_)3mUjo zt!B)5&Lxft!x`FlD%YeX{Yhs9}i z<_OWY&5~GD)>zD8)%>N=>9FWJR-?%WeU!sj8NBbSqL=kLV<{?iiK;n8WpQi;T~j-i zTV5j&dmcJPNdvjsJAuMcdefCIAfP@+c2j>na`gm1`ebs3>m`bgnsmieDa(&9?7mfyBX1 zd03G9*jOUm%=rAniNpdoq3cIP#H*XSm#ooFaeF9)PKcLPDi*mm2fc}dnMFn@ zw5{sWAC-rf=FDMy)m^m8OEUO{b~V}THp6$)Rd0*%r6kB>YJ*&m(TvGKD9_v$#nX!m ziNR+6){=|GaDy1#XD$L5@euw5Jn(IL-H$X)=+3EI#ta1)Y3l{DO#O zgA}|b^UJG=;E3%D{l+)y*GBvvj_@Ab>{*UtFfB_<&Mq#io71s%s0Pz52|7Fh^vi7Z zHX>lliNQ;g_0epuSGJWWID1aVGFcjGE_FRaCMnVTtm5K*N!_Eh5k!q+wyuvej?Gd$ zPH>&Qm*7T$^iPJ(ETV*sPN}_0nPl=@*~Q5q$gJiuz0$6ov6bzoNf$@m-j!Q5jt;`E z2PfYu}zmCPHx_#3vFCl4A;l}2o&nhdsjq@`8R?F23 zr>mwYe104T!UK~i;uha-tFTPpISSywMy8vL>Jt$`SlF;3)b zlHf3sh*XqN_W|mYd&RjH7{uqbq59Db{Wii11qXOZ+9NI!Fb5gomi_tOoDSZ(Hyhy^ z>t%6PJCL8}J(8>=)MR}cw$G!e9jA3Z(>^XA_|$hO8~1C0bzRyLsjp@lzvaT+g?&9K z&Q%2m5cvxXE-Zwy$4<6yx3vsLILOpqZ?<`!(NZ*DzFl;4U5n)7(X_m3mh!U3EE=9; zaXY%M)?E-yKZUl@7OZZE>(2LQPfatqSUoscHK@_KFph>HM|s}xX`z!%3msXcZ`?&3 zXUcr5!$jfzn7rdFLxatF<>h|ad9&e5=1N?LIM~>W6{W^%g+Y1j_T$$qC`z|oug`-J z|AshrOT@K)Pc`!~PbZ^CAEczlYQ4IBfVyC`DsdAJ z2ADT@G#$3&@yrT0b~mzwco3MdNs=eaO^VpEg9r_VJk5|uRHfY6?dk^!a;A{>W{IvvUn1RT6AzNR10#^dp5PF8QWEEc!hAg%5y}z}Lx7*oVflnoz*6LVAIPkr#8C<>p>G z;YI^#7Mw=GoA)i;FX?hWs*x}J@c5NE3RqKQr77R8o-OZCX^9Kh@+!-#oQ)LhPb4m! 
z5>C3CVx$r$(aSp!{=ili$vQDVIMV_3LmY+-R zYGaG6F0(0KGYhF$ovylpSd3TDs1XJjkMQ(HtlXB=A8ql}udm8-e%4v77T0qpoVBDxRKowf0;#)ij>Gt0cwtTkmojR8NkIMdLU* zyhLa{nrRP*KKOa`@YPhJ+Oql{BDWaHI~QcGc?GAcYKy|-+zKatLf39Hiv6J6|llb_RAn~MoNG( zZhyolT9+J)aF=|mF;OWkiR4=v6r7lrJuU0gK0Ab(@J_Z;qA@sGFJ4;MGK_#NQU?g^ z;>$@7La_)62oy=q=pw35`$HE8b*AUxcDOI_meRPQ>nKrkw)yP1bP>mOL9HOT1rNWT zCUL{nm6MJea|v*Nl-jdh9$O~1m)s*BZY*oV{#;oS;KSRb=RB3sI5_wy9^Mb_vq;kv)F$y#PlbDh+n@8H#X!&2qK)gqwA|{|6Qv5Wv6`ITvzGaCk-tKa_()YL|RdcX^ zTzyZMV$Ue#FVH_K7q@W2_m!n36_nSyt5Gt%m@v1OEOlX z=UltqPp0N&^W>FnZuyaVipo^1*!p1DlFAp8bQ=@h-|@hq4Sw_uJ%Fn3(1-5e+D8gE zQfeL%eUeG*sEx4Nvf|SAP&{W?acHcR_pZ zE%TS`WF#_@)-}exN70(4RhQ1T|8%s6^Kk_IcbP*$m-P6=+jXY2;+0T2tDYKL{W$?O z=6yt40%j$G!E_#{s)Hhxon7@*mSKtuS29$~Sd$UY7y8ELy2o=B)z&T}Zv@^2;|x69 z(9O8924?2zE$hN_3e<;YKA`)5U|D<-tjbmMl{m95cV)ML_bX23w`iA4Aiwf@<7($x9lcGq`u)}7NCl~6AYh+H{bz}dlSVHLUN+eo!8G( zM`5Qcmp+&;Cd-lB54IwPSD46i(D2%NNEm{r!RKD@PVo*uY6s~qdkMB_UJhZjMtJEM zOcFH6DWIKp_Er>!#4ddQ_~PC7a{A3L<{Irwy4e;hzE>pW!)txs)FCe0LC{p6s2FD= zZy;^;=KkGQ?8%8W4B#fO(I!gz9o^6OBE*F}>{Xd!PUnP};$PdRytH81tg7A2=OJtb z5kHwCZ>e5lJ^fH?Wq$FbE8w$kba^VWRBaMXc<2QRPlZ7tWLo()|NOZjq3C?AgpZq4#v+@pHEK$h4!}xPoFDc5?kKPzmaOrZT;WD?Urxz0uzKcTccOryTR9{~L z?EUJsU6oe7zETf5&zy(DNt)kH{EiNZ#Q6Qb-Y89SFRhdva0nLit=R7``tIzGw{j(s z(Y-hn>lEzs)1Kku3ti+f#^4U&y(}uJZ2sRr(0}GfQRn|@$L~+rcn9b0tA8q;;=aAp z+7m{AyAnCmGo&3F+|dA>rx$Dedtm-MLirhJg0BTxp2_@v!tY%m?|WFzKVAE~HD^EI zSN=5Xsc*mc3iRrj9b19rkEgGo7fL3+$y!LH0kd#`MPew~eG|;xICSciC7M zU&v8{eLTR4T{su3FCrtxo@r+nYYA2ycOTUI@Z-M|78IN3OnNzn_Pa?lG4Lxdt0t<7 zA9YchJv>h6bbG$qww1`kar5$|i3DJdR5p?f$n;FH|D@{oU`&;X7+rb}E`0TM9-{Qn z_?3j&R4Yx9*>T!|X>4V@S+!C@hHnVX;&614O69zhXqMJkf65F|?P2{6<#0GP z7LPfJ$ma553+b^X^ud-Q8JzRFEfO2EuCxrcjDsWkq5N>N3Odpg`g6w`zdm6C%K7P? zaBlU!=*@Of{O<0)kQ%O=l#!@JT+uhHi1_Q{N-_=b8Zt-S=I?$KJ|zobdM*Fu0Ehp0 zGz45Hob|JFW8<5#2{JP-r0OWbqxjJbmT^#RAVzD`7F|SXB$3Fbc6|Btld^ntnce0D zKSTN)D7U-yOnr7}^^0iFQ7D<)m^guQ( zgQ6qDP8J5~aZYN&P_HE;%bhAF{>x9sXuJz0U7v_3&AXX9l!w%~CF&!^h6HmE!mfdIXji4js4DVW#!IY2{Guhiu50?y zl~Ml+NmLv1E6nX-ZSt)J98Y{zBD*m`PdCfgtiu}y^-19pjE|Cv=5!St+BQf){uFE7 zB8dV{&G4(5EN%R8&Lx zRdtbr6`t>X_$tm8FM{j?q0w>;2G`WJPTFXL7k2iJ5>klH)jK+EjvnZ@dPUU2eT-Y*JAEi_yH& zU80lX<*+&eMzAO%x>)3p-9J!L58kXWo|D`u@R_HyWvc-3Bx%?Fs^Ak0nEaibMPspY14uVqNL zQiZkWtQpF+EoHa!$3pZqX7tuu_1pt#1bx1xdo$Q$r;Co$9!N=h^E;u3x*v~bx&c46 z^oO8I8k58&DZe=fRm&MOOkSk3HBu|XtWanLW=TNUrq}tS;d$JUIVzITBD62max+LP z>>af3QGc&=?RLkKTyK}uG5hPALNs0bI(aIHrQWnR?x&iLVpXo@YsL<9wlT3|k>n|* z5x6PZ_`qKn-jee_B130CyPzb?XCWm{q<`8>Va6{yaXo2HEd8*C%AnVQ%y&|vmKGIm z6iOkwKr@l6v|N{f@4niypjse_sPHi_!;ZKsN-8U;pxAwiXpyg>a*QJ^vr@4A^Ddju z*y`=Kh9-zTYZD(KbK-#wwMufWP^*rJun~{+=3tUIFUjYKm*hlDiIOX7S>$j6T5av$ zc|GOF=<2JF!_rfGCb7LLd1>|`lBj9j+Ifmn(v9oKNuJYG!6};Y8e*^3lWmX#t7&HV zPGP<7nWMPF(Pp%?{janihGO#gMop*>5_!V~zy&{A5;=uWUZA7#) zc!{`VX(K+RF6&?_Aix5WA@ZiW9**wT_Vr@Y&FA4s*y#4aW-~SSw1b{}MqHb>GCxt) zZ1l+Yez8ic-;CAYQTg-gIz3X{(o|ciyyC;(&6BmAx~1BzbaZOX#>76O2j5I*hejhs zHxtArUu^mMiT1u>O&uH+(chgfra3~T7z`&?JSXVgE*dt7bYt1R@9W&CUdH~-MQ-rD z6|Bv9lgcRFyCCOVOgEED*UAo^#c)iYJBrA`GS?}_G<&1PxT=k&xZQRL>d)qdAi9D9 z>LF7TVEd71RB&8LfeFmMU&;5h%E`}cVH?@6Km4Uq_#ww~0=3(qC)+zNs<)wAf9G*k z`=G^{XutzJM}r&R9B4KE5PD}pz8a6BSh$=L*Wy5x>KIJ#q2qaz5A)R{ zFw-I)@#-X&?pM-09ZFwNHTcXQK-{@--*`^EZ+A1XSYuhh3*VQPQ|=Hc6bUk;Aw_&a z`p$=kqIknQv(8uBfy{mGYMjGHU@oy}i_CV8n2Q70`QVuE6D}bZmhk?S2dm@=3X1o)2*ZsLneX~#hfPOTxx$%$i5Ow)O;im&yH? z8RpkzXIP7*kVd_QK%b;2?bTXzlZCVqD6_mo?_SuViZq6uJ=;0zis!nUc8es8uJ}!@FcNzInX*IB>Xw=UPs5UpKH-7!M%j! 
zBd9hTI_&2RLLf7zishX3&ksF?bQ`~PCY8IW{ym#;6()pIR(ejiTcNJ3Of;A;c4OdB zF45@_ce3*ismnUv8)0I4ls*3>DVGR-rChjSB-i5^nL%P5T}*0DBq=AVr4;pQbuC_; zAmN1dxc`;pl(5yMD^!}xS8MY{F`2nO92|M2HnjJu!Rzh}E*9tf^OP_1Bi3}ejU^?x?{An7 z<9Zxy+0~}mttk77M5cF-Z&WQMm?f-Mv=DL^WNr_Xi#8nnEqhex<{z|qS9!A4U8<$3)Y#FH$5Bw6NVsO% zx=GeZ9u{7&orplqPR_NvLdhqxP;oSN3e;M#F?(m)CKn`yr%5NxE}3oBA^INVSp&Cl z^5%9Mzt?l&z&@NTe@2Oj%|0@kZbo9m5#-H6r#7R;NES9CT4sbR$jqLiH0{(P3aAFGYQS!bOV_bppB z&kp}>kFMKYI!+^04U7W1n%p0o4XBFa()l8q%@u8Fg5U$2Cns#RLP0pF%5 z&2P{+n)ShLT|I;FqBZSp?pc+#ti7n7i;o#`Ve9+ld&l?GxGzvvmnsDVM|IpdH#@~G z=AGBq9*aDS3B2xK=q)xmv}e?^CcmEGad435m*L*qNH~_nyv8D%78Qv?Y`9b1v+IlB zE0?RX7;y#}iz+chqzrABY2$1&u2w{RzoVDuIeLcCk?{#FaG4;aQo8K$&21KFI7s4> zO{kF1d9rJ7AS=2~8ccqA`j=oPCr=nSQ#ZTdirP-SSqcIc!exQo?lDHiuUt`~)8(U_ zl1@grv$lL8qK{{NS+(v$8Hlu06zX6h$c{S#xi(4hX<+K2B3PduG`0)GbpPn}1|V?@ zweOno%&-d}kR>j66e>h61H=FossKe3e7*_y2(HiwV$*XX`JCo5Biy0swUDU*?0q;Pm!)Op{-JR7`gQma zeMX$49zrK$(6>#wy+KZBrK+8`f*1Vf?&JCH#%4FO+DUQ=d zypbo*_2Y*dT`Pn%e#@ot%@yFUqaa0^Z~;cn>Q3p5hLD5uFTJ0UZ@!nR8Rfg$q7WL; z$#g?2wB3bwIYS{?37t1F=0- zswqfDZ5a<)u)eHV>n&=&%Q=d|>eG~lq0G-R*vDyfpVKcCT<#$!4R{uz+H~*5bI@SF zWOF|tk?G4)4$9N>9Pi%Ye&-_Qjck?AgKN~rdsw1_VfWlB*e_A(>`UL3)E#j`4%YO& z`07UTNyEsr@z3`_O-@tP4&kXLkd~r^;wc2aJFmoZr+^k3SN~4zjn{_$ld7+Ig~2r# zQ=FV?iRw+~OAMUqV#$Vx_sY`-GlR?HQ^0&$`TMB;y$a@`kG+u#HJbMajT^6+Q-v8a zWu2zkGg16Wx&un`>nTMx*1G(p(UFlO{M4Y9un$&$-0&36=9?hPj?UG7@g(g&EuXIG zv|JSENq>`Kmsx&=3&4rk?p^q%NKIK*^xGqK8Xaz0z7nGcCmq&d-czBEt8Szf8Xo?U z{8v&|62q!EPhV&VGqaPM%H1GG9q6=bVNx{CXjMDz(4Kgk{zvKi(33EAio~=VALnL> zmRj_bsfrnN>n#yF>NYHb#u;n+5{VcxkB9dlE{n{Bz4n}qbaDsBDs%_h#j5JicbA09 z9#3)KG7?dI8t#yZ8&2*I4aL5EYML`lHI&ZK$|c=gT_}DYA-ih|YEHIyE*^JJypeSvReCwritPyTnGGif93mw++YHVm(19W0A*c;F&P>5<>$xyDHFkFI4GjDhs#Yg zS5q{}xMQK#W6uwBwO(&;ZBEeZgAv;=u$2x&k)2hjvol7N7me!Dx3!rxH%*dmw7; z^sex2vra#~O~@;8SwHS_mq^eO5EC4ja$~|gUIF=t9<#Q5EAL2(X$8%+zx>E_e9Ivr+@3=1C!EaO z^lam;&4V8OPPy$I;63%%zi0d){3EaogTmi3On z5=qnee1#?)s2)dayml%88x{A!?D?oIB+^r*!egW3pfnd%@4TSTfi7rP4}K~l$cocr zVIiH4D*G!os4{J@LdW2)s88)zg0+B^6foD8Y<-8LMtxq+9L2BS8iuZMRU3f9h>m$R zlgII!d^_M8k$>Epc?W}h=N=g~5Aw)JOyE1c4KYuea&7=-Uu)pqEqKu}=F3_7kST!Z%!qa`USYx@&c0fU+h-f=zecP@B?w-tNLOcg;#tBOdp z4CM$xo!a+Y0~^MG9%mSfm@?e# z$BU8*2pWByyRVkGi>W>j1E}yPHU8Rs_2S0$uyl^+nVueV)fXR2HJhdV}EY3;qPx(J0TvMQR? 
zao9K38|d+Qh;8mfZH$rK>Kq;RL}ZnE*A^n>l8x;Vn5a^N%qTP*sR>qM+Jx+0)F^PE zENxB$WbrMoiy@T(+N1kP8)p4LW{bpejNx_aMC=ZP6Rf}tLL%+u=nC>JPVdAwsTa&2 z-#y`r4%d)1vk7o8^8rXnuzs~a;J|dB-C?@m ziWh~u2PhoM{u>K06uq7_#@2W# z7iRIVOOKUupAG%!%WnTZ-+$|pID>+jm_hF*dNJ1cE@Z{ENC z_?`2t=kx3gVUAwPqz5FoeCxw!P9~+2IT+?wYocvo}nJ6;`6_wHc6~)Wph3fdUdMexXO~&g?kDti-3>qZ=3W1U&aMI9Tm=~NK z8NmbYDkwc}Jju7yWb<^)OcogSLNiy7Xg_GRoztFpQ?w@IIk3n3OX%3&&(4CGAp7B{ zFc3}Nx$60A3yoX*v`XJH;SM3Es%JFST2rl#H-&s66xAAYE6c>PM4}uOmpoKIzzUMo zs0+-y@X@Kn+g7ALUnDxJ`K;SP548(ScWVC^b5*5VAX)n3_Q92Z=x9?xKs4dg1vy=5 ztmVt;f`HT26ABk2PDI)Ev3e3BbdH92@(;UR`&^N`t2O^;Cr4^{xhTVvT~{*=LaZmk z7MkR0tBq{rg?|q`guckbciyir3tpVBhMyePg;w3{KA&>FUc~lKG_7Gm&&LC4DfLqC zEU)IgY7P-@DEvex=bWVQ0P;p8)cXU7Yv25{H6s!@8kV#@0w(t^G2KocD`0I5k1kI9 zxA0_)vYCA-RVOu$KX#Z{SCCev_7JI08f-c%TVci7^dKs-I2`tt+H!_8x@oP)-3|eK zq~kmNl@MNu%RF5S4$IQBE77zx9`3i zFnd}=zQ?Oq|Lm%^IrUzhs|w4y?~nqBFnm){LRxLKYc+&Hiqw4D&Cwxt8`W|tpmeK0 z6V8_wwMjUBrL|?Y=dTsLzN_nd-kMC|av&|TCphMVMSq~;>H}TDyx`CEWStx*s?TpP za8zJ#@gku5`UIp4X}5=YIDi+Ly`{66qan-vwC5V1IgM+H2%u97XmoD}uYRaqq~G0K z;mvW+71mm-(&`=v_$m^=Y5(LVG*j-KXe8Sb3k}>|*UMe@`;=o#NIss;HWBV86=FmW&gW^}np<~0j>-c- zO&}1wG!RT@gJ9I)4vwz}2#X%6;{CZ4efjIc4%H zCCeWFOkG<2$se6KjL~PlQAu}2)A{RkHgHL|D!Rkq;62Z@RI?K^(kp$gK*fEO#V69b z6TpAT&vF0DLK)B9qNH;bM*4Fa4UHsI2}V@0%ggwb#aa}=RZPBgEb>Y7#;ePx3Wk%T zzQBcQEg#C&5dODPz$Kf2>d5vn?KNv~)2p z55b?UkVKfP#D+b5n4xXU)nOi#uG(N*(w&iSc{A4WSt&vy=9vxwRsc6o1rTrcE2A?` z+%lIlzVoP?V!njb``XLg9|^e8ByT^}Bw@i+pw9PCuyz>kwvz8FvHh#1OzirAZSzJ1 zCc3Q)>&iRPVnQ{zGFqOt>whkY_t@%a2@0~$-H#ch6+3mYK-sI?11UKA0TCpe%oVDj z=KbFIBl0Z{X1#H~9jlC6a5*0JM8qUBS}lp5Tmz!k4(`;2w#6HBsU>Vuk41TQWNj`?1hgaGV>L0nfQK0-qL3n=*Hjs5rw4!HAhnDMabsIBk6g{U^ zE8sOpk3C4tZZ@Oa53T~4R5`GH641-x!64GS8-p77+NF^*P;P8x%Up*cm?^JvAYqWyR$a)jN#^qbj z5CSf>Mhj*-%ybr{?_+TStX8F;Z`dX0Pr7TOeND$YmD^XY8D48d7>eT(5@N5E>WX6Q zrd;>DtDnybH1Gq-m%o2#*pk{z*)V$ZC})59yVEV= z)7?#zk(lh$QviZcxSdj;PzVSRIre*l2wTerzY~zz-M%N$_*ZJ~>`TZaUq1#NPUNc6 zl_)`so6w_&*}4y<12e$1@!5!nAAGFfenLHj-Gy}sa+5mD(?Q1${a3p9-@LKW#~?l+ znQw95eQ`LEjld5m`*f)P!r^iOp||V)1!3dLq!X(I9~FmGL_gYf(8`P=vv6x|EMYI? 
z;UUT|Q9S0hLBa52GSi>Q6rToeeWRy`V9)mPF(xhTh7AIuMD2%`R+#Net25K5Zn*wa zl@fZpcL{M1+kdjfMna!p+fIciT*ffocNcJAKqDhLOa3DXKSGpjq^W0^n;pPdZ2}C3 z7)AXTN`Moz4WjC(_ zT_9QH2%%ucBAwMW|2n4T7UOqcX;^F?53cF`z4#wStsMP}RzaW;jD7$XPd9htcU^lL zgkPhHbtar%_b?sVBO@UH<{$pPlzfsHNOita3rf^*ka<)x(FP*LaR zDfaAqk|GHg7bCyR%Zu^0U?U^%P%1sjA{^hu>fz!kh{3EeH};hPrn9pD7ql z4G1{XYIlnTsB&?ybk)@{0fg=cR#qtF9&M$-fgpx@eRJFoCTDyjvDj$4iE1O$yU;f{ z(ZWd#p)K-7zvCfLP+)x7nnKXhZeM0GY$z7D@qFBp%77|&WO7xqC({vFW)PGMAM^a2o_l)5XXK)Mmbn;2HN?6V1pv=YKx{=;Ccm z#Q#JyupXVCOT*!12iqju@$86JHJ`DKav1NsI6C;~wP5n{vUBmdrS}Sg*!^d8zV{xE zCecVeU2foV%G@8{7y<;X;sOHpTocO`K6-lE(!=Sw;faCB{xk6eR+E6STZ19Ca)9pC zo*YTVT<|tqZ&uL3(ddlF{vR(E+$VtQ4z-f_i<F$8r#+2iz|DekL_S-1?bFS~9+`f%n`Gy;%X&>GI}zoE!V1X9-WWkN zRD-U|NtqFq-g|ZO5zNm{bStkR3v=(&i?|z+4lI1@u^kT4f{{iR{DYopnzE#V-rbEi zzBdu&@FFuxJBbekyn1u@l~?i`IXTBX=#Nv*ns^U;7b+k0+Yg5(vv`di9!YqsDGJCq zvY=>u*N$j{$nZ&2`tJ-jV3e=>>3UGHYF$1LxZlBs29Kk|BDMqLyCFGeO%{1alk z!XA5zIeBDH$3*^0w%%qXgDkx%PbQfc7UvE_Nd+5!HSJ1F`M4nKH5g%)EoZ}1ie0q` zhVK5cb|VZ!1D2|YhIqDYs9Z|*6oUm3q?qYy-akkQ1(cI*)MGVQlv?kws36$NN`Gse z<6M?aVYH6~4=XN;5{FA^2uNd8Wf9z%#epI7x&f=X41*pMnfk|>nk#pBP+^v#_cj4F z`SP1{Ze}^iuic-&IpHv8YdX?K6z>9^~{pz{8~L6gr`mskv*2{8Ate->w6 z%95bl)rPtSLE{-PxGW=R|$_W(FH?qu;oEAY*4RFOb#Mg6fm?m}3^|rTJKaN&^0Gh|PW*s*P&IGzl zRI_r4OnIj*1Iu%-_simC6&|Gm!PaetugoZWD*Z~ELW_hgubSJws61n8e)!;Qle+Rc zj&uqRPst%$%#>=TyBls3cT)BR;hBe8?4wNPGzF*D+xU(}(gM$K<37oJRVVi6ujsh} zubc{U$qFqdx0zZY%~jqnkd9F!L~*7Cq8w|Ed$WQpo{0f0Hd2dqmR$fW(yx3h-&gF& zdMoL!^HJYnHTa*6Cd3QAn}fwIR#&BJw|c~OrrOw9a^=MKW8&u@%nO}o1lu>R{JB~7 zMrR0xfky#3xeYcG25Tcw$>rDJhk1IF_u62mh+AonFva_h!=Js;YK3lV?_}h9L9(->L$9M1zxr zUTh4JCv_f5R2RU;4v4JMKA_}Yx`zKid>eKkgA4am&9cykxcU9wa_m(^G>HGZTRlAV z=LGZBQoMu4Awb0S{{f>Q!fOmI(gwMn{>|dRWBIhjWuvfn``U0Wp=#$bi=fNLhnb4+ zey1UL=}9R`LT!g+kBM&Kk{93fxe0e0GCE*Px4Tx)x96iN@k|1mc*<5>Xp19RoM|w& zwZ+)#jgGOOo^#% zPpT)(M>+P>K*0#zocUEQsd8ob^~Rs%@S)M$iXU6*kk+~p_utxRS-{FEHP zML;Ny=%O0%lnl-rOTUv-V~cDdt0X@>gvC?D;>jtWk2dAv%r8-(PVekz4QD90FrMvJ z80XCjGCc~l;&e{(CMZ{LvEb1kBI2mOCOKWxHQk#3(aubDtCAz7;B<_PZx53l^$r)z zw&ZZC_|3D?mgrra_{UeLC=Zj`t!CI0%}jlbvrN=oMPkN(XmLO`bp1{4f<<2|$BUxHwBWKV8oP#G_5Q>-v9hf|A z?i^g7$5Hav#JPMDFeXPI;i3x)n+rh|U8R{8S5x6DM%Hv_y8b=>wm)&HRMk0@CYUbw zM~C2Wu(Z|Q=02+klc6X*?*=$l?0^sn%oFjY`_&^J4|D>Amg>*VYW!RtQ@856(PmjZZ zE}*4+HOEPmdrChbvq~NIKvg8?9EIg1PZ&`RQ@)178O-{tQu99>5TK#*m1IH4vFW@8<7^}`u>06BMzd`ov(MMel9`qj=auAzL(a0QPujT6sV&md& zV^l3r^RZ!yw7F|=J)ZC0zvIDbP_6;|V0Ql=dDccZaxqWTFWzN9IKwTt=+XKmRtLl83SS6vdia!-R0A7_g(suknRb}8;pCjQr* z2PZuN6~ie>Xgm{TP@7_LzK%Mr zAy#<6AGfb{C##Y@4fJygt}u;8ESZADe5rxMB~0}6vB9dHs72Gm4Ie8{8CDshd>Uvztye8lH8d#EDMYV8Xwa|*SG)NF~jKI_8kYDWr4MI0HOX!kf( zdrL%raPOFde>zdWZI!ZKuHm*+cZ9>e(*M0^AiJBFnrn4nPQ>PmsdecSm8(>m7|MSK zl+TfzN+O$^o0nMJw2KW_Lz%it!I-=&?LkS-$}(4UrGMr%0lh^ zKhPyVu)}K{R2?Wf&{*1;u%ZjMxAmey%udWPh$5|c$tGi6`d*b@qOU^L=;L#4e4N^U zxYyujPgP7KI~|D8Q+Hzw4;p-brDU$KZBw>lB2PG(P9!bj7mO2BLy8-o-jw|nDD?Y` z!*)TT)!Gwt-Av470)QrahLC!M__NddHeXf(N|Th+jZ2=60@r&$498s#zdQ(-Q_$4}i8dwnyYS>2rYiS`Ek|!fjJeR*jRiXkX zBXwZW&)Zm~Ie07T>{crCg7upYOzf+cTI&`ht(04ai@9*U(!^YB;Slu2;3fX&>A~1o zc@-kFKA~k+rT|+p_11IazqY(Ng8E-xoExA`Z(u+;A~9lCg?Wm^%GcFwU$Wx6a#VvLuxwBy5JGMyCTcD?=@d7 z0kQsaIwIY06*-yp8@IX1NDxj1Ti_i>Ttdnt~UDEk2x`>{mhl~ z=muV+c@a`%B$py-%HD}D#qkuTw>P30rGrTPDDSn~9wlc$F~qj+$7iAqOslq$GJ>=5 z$(xt!U{$8v{+90_7Xwd^M4);z6&fu$ZW9eSJVEP|7!ifyW+np>LZay!V(o+RPnvh8 z4?gJ>hQy^MTEjj%f%gY37T8}Ms!|>vJbJp|wxSU@g9R+{92gBz3p*!Hmhq7ipR8AF zxV6yGs#U`FyTae1WTSqTA4bYGSp5BJYvbb7zIpDc z(wm%}j|wGNTWR5msF@J>Mw{6;Pp6ab3KzLYaF_CY3prTX`l|rG7A11o*VnK|ol5VZ zpUo!)1?(UzId@Vlpx81;M;{PVFe1ZqECLi|0kV$sm~HV+GD4oyQjx|cQ}5}YDK!5R 
z+hFX4YR;9ybM+ln2nQU_T=H=U6oY!d#yp*g0Oa4+X>8k1GxLXQjObz^c9-JxWtS?V z-=aXq$|1wT3l22yJjkNVm#6`ZX zf?W0e4O88)+tcm{JLB1&E>;S$*xGVSdg{?1ONfUYy{|X|cq0QoAmAPLj}>Tj+CzeZ zRF&xf(P!a3(a4gWJ^#A8yS}Ys2E}e99+Z1h2td7z?3}}b`D!kgbro2U`dpBA!lJ8wEoOf~v`Vp!C_BrQEw`Qte ziRH;vdWGFo8nnkZ80@fJQ!d}@ZCIsfVfB||)u}_|8Q5@ow_5{uA^>=v*-%L&1Vnoa znB1d^`IV;M))bhcTR^KG>nkqzKW&%b>H1`4ZC3iT?yK0XAqi)pgGjZH)oAhhs5F?z zipF;$z|PZr2g5|J6wW$-6Ox^pcCt(%RF2r{;u>76A828)gIEs;vg)YX zoqbWuCC2)wA*((zqjUcf)xE^O*PzF8Eu7=S_vGs`QyGgq+2)KG1bG(dS@gEnzx-#M z6^@UcJjP8tM-yoM^Yf8VsFZhi3&Q~%{0@iX>hO3Ry)%Yl6bA}n8Z4H(I|^x!7whZ9qn`2ewQ%pL2+VdSvW*>Q> zRy&IM6lbsI%0vD)7N8BD#+i!A1b0|Fi&`7(bdfd`KGAY#jU(-9q|r)9wHB`rG=sn$ z=Bt83>blohxXh&O#jkHjafV*LPD<5_F*VURa#?b@kdPI;QN9U=2u_T9 zEZ#&MR-^kCCicA!KSa2Y+FR5qx1fJZSixvz;OS#^wnCpL>+=9vT{dZc9!rUBPC^Qz zp1c?BW6%|e_nbBR(LYa*3u@4`Z&Q+Qyxfbt)j>|&Y$$B zlNyY(hJk>_eMr|QxLK&KP6obUVKodEl;ph zUbr+JouSUnX7Y(HIcu;}25ftlm{vshR>M@?{+J`^eDbE0^fp7+?g~`zY}hHqt=vv6 zjca;iOi=vF^`=733Nl>EpuJe1HkYv4x<}|)n>1KRF4pG1coQRCenFfr`xbV9pUzf6 zVC0IKwizy)$+W^Mmr9=s#7upX^oFO?Fh-5@h16!#zeSUjcblNvycW3W`F*O%)vw?? zs&r=W`nHjia<;x(8;Dw_P{?}@UAU3~JJ%zw(ZEz!9oJl}o{DU>#>l3VJS^}0BLk4i z*iPBN2<46*^%1yhS#LJfNm+%#wl9A8iVFjLtxv#XXYIfW=wl~``of* zG;58fBFCes=O%NR14`v)hXd%`wZ_}4(}jwy=2*j_C?p&lKptuaGeOyG7m`h#E@RIR zWtwCAF{d<+&=V0LC$VTxeO$_|N1Tb5X1_HVTLg#EEa6>s>_)+Qh3Pq%MFdvJwqHg(ED^Gtxn4a*5o97Q%VSF_I2*8LMcRPdv%3xS4r*v+)vzkOj!<2J^~rq!UColWe^0#@9v!$`TwLqx z>m`aUj13lx1vVS4^Cg%5#l>WgE|0_{BuE$-Be6tc9B$h{Ls7VU>n(Tv14HpySy5%Y zn=sr5hCq{H1Gh2Rf(>edN3=%;U{11;HW1vHW#ML9?T0@K{nJ5;yDV|X5HLew)-~4%93A_te&Y6=%j*zaGZign?*v zGuHNh;t#BNaO4g&gxU4=^{uvCvkEfkAi|{Ds-8K>)nswE?HJkX&wuF_5jmr9zk{0T z$%k?{#fX=~KRs%jC0!u|{r?78#5^F^4vGA(4?lN2;f^zs^)V!O_gXvP-{ zhQy_)l07)mwME)J2G``ET%y>6(-cPIeiQvce9B|~bn;c_y0xQ_XpT43W$H0n03;z^ zLJ$xIsX)OhZ;$f5q8zfZ;b~#R*;Tu?%6MsMvyG}Gr-Z5(n$l10J3XYhOnwl*2jhv- zCRJ2RZBwZ(Rhp8A;(l|%ajuDZY5RQw-Xyvr1}Pe&7i+}Vq-f_oD$;V zIlf<*O?;l}=`qdeyx|oBgD%zZ*@=9CeiTv4u1KVi@ z;%%jO;YE-f3E8pZCfde??(CQEV^+Jf9S~cLSeLCdwCen=xbU(BLKL}AEK}Ic5v&zr zsMgo?P?gSMPW>aadSW|WJ^Ri(DVsN$ck*21>)`NH0k-Xo;}SbIB3*E?pgMoq)_gEg zBVAU0D%>9mYtTJ^)vyQOpg&P-mS&$*BAoe;IQS;3zwT`D-LlLPV}&Z>&L<-@^e90Y z(TD5@k1ID<`T<|(a5KDT20~E!-Y0W73ZLv@CzbC|_ddDTK#LX4BT{0<;EL+T(0(9Q z3eySlDtPUVMOY(i!5Te)sKK1UiuJD|WdG1mY}#rjulL(*v7BuysVfO9D<&*CEv?B% z5L|^$LmFUvjE%3$dVlvSD=)u!`$#bvOS|eNr=YRS3i(iPcNsWnk5kyv9HXybC(H|R z!f>M+y`Fz&0Pe$-*$Thf+iJ=Z_F|HT)m2vK3zh(A|IJ-hU=$P=gT^E!q3(9^1f1Ho?4|tR&YVkvnprAEamdRTxI3Vg7?cvJfGv)gKeb@13h zzeTO0lB$%lGYf!i0km85sIa*?1t5Qtw{An&+gHnljSeU62QI}Vdu2$FH>XG2tWS@| zyd`@;jx_8Z-E_2uakC4V=Z6y^E~T$hbJ(%ga(rofjUFoPb{$<{egg(r96w?~B}1Zz zlyqbj6cqfC>;?h@+Zq!))L}u}He>;RYRBn0mn#JEdVeCb*=TLG*`7+%(qd$6jD&(B zQk}&kezF-!tKB*~xYNx%5hL4c?qh8(0fPoUqsmzFGAqB2T?C$}3n3ZNS32cU6A zM&D$Mk`Df=0Z(463ye|(trfOQ+hyoMFlYx70g_K%|;x6&D{LZEL~Mc2Nytoxtl%= zCU(yYJlB8qq@<)I6IkpwT1IJMX?d+iSyEMMGM1#ZSfL}eRF!!G=*Gy1Ho%d}{}mw- z3~m_0due@+?t%YD$+ciW5{o(H_SOL!8d|NxgXQMtrXC=8T~qT)(o`!|+9oMp9k7qTL_gB5N z3N6p^;6~X$IM}&yOEdOGEeC5`2%=gSN#F_#frR@<8SL-mnLAggPz~Yvf|Qo>43Cc* zc>3_NC@1oNSQJCu+!VN6ZJq{af;4Z$HgA;nz!MQ;kQ5r5)c8XdWnvrvip}uFX4zdH zccrAHU>t}E33CN5^8U5_vS>ufpLiq*vNo>fix`0bEp>HycoNpB>jMd8{C8XaSq}#;FTuh0rIqwaQm$BUL^Ai(T?l~hRZpfM(KQq5}+Rh2h(j~Vp zZ<-@Sc{Khmhe9h>F4pwLWmxCe0KuQyi^x?N-aJZ-W%LX+ZF?b>XT|mS@ghu2YRs05 zwz>~^oo)|;oa$JwRp6+m-)xp}u~K|2%BhBRyZkQBRIew7s&CiTqnR9EFR-yTk;O>6 z4hey~ahy8!z|5!;^mo-m?3Ia0?z*_pODi%fm(S^D31K;)# z4})a~EGaR4IgiUxdev@My8+qwt9LfF`g(7?$<2M8>HX>h>-daQ4POU!DoscL=p6KF zr|k;QNM0=;`>*U%yoeqob4N@5DKTZi)Jc<2P12jj}oE0Z|d z+0q?Kh_E=<9=cu5u#%K)if?O(`C~1W>ea`;{cr#@je 
zGnx@$VF9<1HhTmq`H=vGuBGm4RQYSETSwjDnc7<1>WVOiDblsGlv7+Ek80C1IlBXX zbwK3muAhaFYu2^97TZ&rv6jHm*XJAbzeDb%LBE$Gvp4TX;H>l$eR z;F$_Y?vfRD8G2UzxjJdry{+Fb@`z<~C)TfN@HS9w?%gH{!@AfqaO?-pD`lFiNjXCHI`z;MyJwz{ZZ4)5G= zUhZLd=K}|)7FwxE`JTDMM_C?!Gv_&y_vuje=y{?++}__e94mqsgIS(@ft^dg&>H)_lN8!Fa`fWT^=3! zO6>zK?km+CPk36|c7;3K-(wjm0{VnSrKRMrEE?Wn8k3?xNYNAifweJxlSN9RD{bha z07Udoed&J6=plC5QW;*ghkKA4zN*h)srEP`{RiHI2ktv05c={(BlyE7KrS&hKfY-) ze8FI&;aO1In#Q@oQ!bGpm+Tv&9ZQAR9ulM1G2u|9-XK^yt^zh^UELa>@Cb{v^fWO8vZUh`?R;2g&w;Z&z#CfQN6}_;S zEy3qWP!p*-k9<_(N!ewuxR;zI~mEEO7pOWAL$K@*=1^)XF;wZ>3lC+fLARiLO zQgW&xQ@=tPwAOPb`=dO^C1L7w@!N~0kqUqwsC6xRt=5e?+_vf69g!V6co0kMS=Txb z-~w4?-qjfePX7Ez@kyVD1{DjBt#5tou-@{1;omU-)Kg`$ao^TrXUDc0#APSiKl9Ol z!~NV9`DXTnnP0iCIjq|RV2<>wsA`07>x`&$a^)HcyDK;-+TS<|RYS-Ay0uvDAd>cx#_ z!R=9Ti`}?xO5Fm)q?<<}+))5IUtp?~`s0fw^@|s@t(;dp-DZ106_;h=Tqr2yPmkfR z9bnNY&lo2uv{$DqxvyKeiV3YTHl@phH!yYSuM?WVfsaG}C_PWxNlz{BM;7Ukk{(D~ zxh(-A_RWhz`a9EEKF?S@zVvY}wvM55)1esjdS)k_&c=Lp+hyIC(=)o4se2kN55r<_ zqX8Z-j{im4TSry3ePP2OB~k)Phje$hG)Rje-67rGA>BwHKqW=G8ziJby1U~5hePML zxn8}$cZ~1->)m6(U>x?@d&OLHK69>T&UIz%2&bsn@PScQM9^Kviw(uxY5o|+-B9i2 zjPdT+wnj+z+&3nzk-cn@?K7%u=+X@jo6nLHh}fLS@#>e!^I4Bqor+P0`n$}2t6Iq> zFIIa3I~%H`4gH&y>{hBFWmUBV)}xs_PJ_yGGv9l)Z%~%~PG2nh@h6Swr}juspKZLp zdtT>j&quR#N;bccxjWe+ViQ@ZS+#0zCgg{rKcx9Hbh)L24-%@a&r~gPuj8xVm~bSl z*_-Si`AKzTwH_X2%GM`rZlA(oMf>^{1&ddha}GyK(?x;btmo7LuV3p8bs~nJX9jdl z{H*GQ1;3dG-I;2;B=)-P4xa(p7dz6mlI1w<`sA7`*bui%n*rUS>tdZJcoL_3a&>o0 zy44BJ9UeM5x*Z`$!kaw{Fst7v@Pug5BdJ2x=cn6_4E4v)>tnWjuV`!=b`TubZ{p^f zkj`tYWTQ8D#^I;4#1$v^OE!`B-s4rd!zu}$vJsZh$zRUD%thQoHx#bQNfpT`Ywhod zO&Tshr^%~mL{n$pNStlBL=zDD8TI92SVC*_33YT^B%d~17s#2%sj{y8`iA9ldD}@7 zRP#}PsR2Y1L%?33Gcw6yezd1xf+s+|Pay?bNi87qYpDt|4{UZ?`Xc-_k*>#Q4LjcT+qmiDX)FE90njy=hsv z)ib~Wy!SK}vDCND@gW%f49&0`cfo5QqqwB@*VTpHvD~3lG7D&=86i?LBcr=`KIX^n zi1F0&{c!c|ogZSzg*M5<4bN$_9W!CAjX`%pH{(Dbh0{+P)F}VCL7oQasGZx$qHFKj zgq`1cXUJhX5?NQsd1|dQ?JMYYkg4Q3= zK?nXHYuhrqnp6we&stI%^4q{+r4~(PIm;)%GWA(^HeXt6wncTzsaHDOr7ZBZBi}2? zU+weBd*=p+XPZ@UInhP>X?1QLJUA*8q3umUrWo|!RzMX9zm?o3B|9zxMT3!H2^{Af z5m;bxXNC{_gnmBM3ha1I`snbyLe*bXX*m^x^XPNm_D8pe4q1qo>nUo^&Bu}BEz%n!loyaIRl9QfLZ?N1N{|;q&K%yvJ!hJm z-xYX7FcF-sXy!`S1#;ort`(V3tqt51F<RGpIZWlT9N}X49 zTo>9nVNyU?7O_XbxLpb8C6o8u7FC?9aJ2mVIk2{K#B}h8+HoW(Wt_n%cIbeZcfVD{ z$*f-V^Qo2agYSbJY3lvkLDejSei5m->hG=mJAzyKf?m4M)NeQBGX?`KKg3*HUe_u+ z+^WiZt;q9(uX$~{vd?*JiB~&-9z)iJhehIZUVkq3buZLBCwq^^A4EvvlQDvmTjh90 z?|Tnn5cYUfHgDrTQ@U3r=LShqCXb&j|n1ZKILEe#`Q@{jt>SVRe~zlf2O{ko07 zlDK$}de08_6LhuQz?5(gw&^O}-;vrIc8|ej+3z0CADTOSXWM2*>nd-l&Jwh*zpbAm z0KQfXHEkLo*E8P zG^0nSR_-HJ04xx6(sRpEuY`CuT)AtLgA`kL&n(ZkZv1L_$ z(4iSD;9SEP(=tq3DZ;m2UB*9hOUvW#6d7D)=BJMrqgZIs@w%d#!sN7nxCS zi?aoSvW2`J;xpmOR=(UEt>v?|Ad76Agc|9e*)3yO-Rxg}9fd-uC956=laz`-|NPL| z>(?@*+jE#@JeuH57ORjoY+kjzHJ9b(#=y0(C?i|$ui1GHHoZA2v6A3$?9jkE2h0$N z)r=C%^qf|sBQ{MqJ+D;@3(xJW-UlQy>MR%jUBoW}-TpqjT8sK+o~gS6>pp$?MO#RL z8KbGH1y_xlq{+iE$pi0qj`P>BC?pO35Ehf$A{tPhf|4Si6>7m*c)*^ZIy0!}y;B>N zPyxKYhItI31HHIswLnLMFu}Lza%#5guZfo~^uNT!-mu%E7=XL$oo?>U>I;)f7x%Y6 zqJ4Uy(!6d%SW;qrVEh8`E;pN(_~lin+mjh-o!Tc&TL$1pSbk!A(O)Pw8eD)33H%%V z%;@B@8Uj&{uA_#`$aAvNyXUc+h}OQoM6B&~rIdJR9MNglmh_ONQ!}-?Jl$=govRG+ zB5jTy=b!n}7C(uvmZ~AUGoKQCw{b%xbRB0D1YTmCGQ0UaO%kt}xSLj4rfGoM(b>rg zGp-#v670i#=Sn+|#9d*jfDilG^ZpJbblc}sM*Tje=N#sFFBwG-hHfAqfk5Cm^MjD? 
zl5Yse;)wO(3pSsVwAny5Us9jQgu$*oRZ8|MB6`ht2Y8CI8@Lt`&xlD8;oiO$GmgUq z6g;65&X8<(#>~aG646ktdu2R4bpiDGA5~#Ep)I3F=mejJ z)-Z5cXpHGMT#@RV@-efiHu()0ARG3j6+0G`t%S5KQj(?#iEjmRm>;(}%^zgvK z^*&~E8v+gF6#OJw+TFCvn*v^un^;<(p?>=6v7EPNUuTjzQ zVzh4BY3#TU^_U2{j4dUSd&%`(Lsa!z9&EEb_gQWa2S`1fe!OGeR~B=Sg<%b|z;0S* z8I^Ko97QxxSJl!&@scvHf9eD_9|*Z6=lcYPqk4gZ1e3ToBvNGn{joGJ-avEdh6(Cq z-X7CqIX+I0y&5_2bK1HZ^^Z+DY8e>gM_FWUjY{l9KWd>+y2I5U4BOh8&swR9lJq{_ zAQyJWLY3C`%;JrJvb3b{(`3CdpI%L}<3-W6&1^clo}TcYZT1}+@{S{Hp8{Lzd07Y& zUtELL`&jH22D?O*R#V-Seq8!tFY`MLxT!D|L7&Rhu~>W zHjLA!pH7Bnq#mqD_~&7$^|+sgYIY4icmw6{CzuTTZ@xojuN`;0)?*DyMF@$9n^PW2 zhpBGnG2G8dr|pMVN+l2?GzIL~V|4@qTYGc|P8IGu8Axz6XAQ7x;)>Yrw1Np4_N+w`UNG|^hyV;bMq#kJ*fZ-fLO?88QY7pl6Asu!;5 z9bY({63XZctgskBlL=U%X$V#S!#Dh^i>OVOf<89Xt;G)~eJ9K3q*kBf ztSAVoN*2gAVpM>W1#&Gojh_tXUe42a?BpnDsM0ttGtoTHJD7(}>qe#bMN(q>)CWg6yYQq18hX=Rl4atceRxT z<;zwJ$V&+pAj&KML{ri?5H9SgYRx3~YLklBFNme3m+6q82h)SNJepy&G z7knl-klo0m%2A|JG7MIy>lPisVkB@5NW<-AXwIDfIT<3WTy@w+l@LrzKOYKz-MvKHEz z=b%+ZnW}Qm)z8*ogr}9&A8hPAaDI-OtcdH-jc;3cpM~$lU9l3R)Q48jyIUzIV-XlX zcRrH^a=5D+^fncQ;3QUhR9%)9LHa|VRJo(O>0bFUmT`Y@`kB-0GyVFU&v7cdGd)U6 z-ujf^>xIxco77ZxRh;BJ!SVR7l7k^F{j+|XxnIQb1)X{Q(y-Us zjxz!^ySDurKi#x3P~8Um9WTc|3_^w`b<6i?gs+wnZ}Dcl&Tnm#c_&2f~U z)642n#5lZ<%8ynVy5P8vy)d`yi4qJNU_Nl4p|N~*fkM)@`Sz#IDX-jlgKvmGtVclc7&5u-vO@X^xbY5!C3R9`tYoqOz+`S|v8+p~`WP+l-o6xIuAm-H>!oFJ~i|WGTb23!S zL=K&IyWHo)J((aN)xhGtNS50Hj{W6099ftdLFN)UIaa%1=dp`rnJ~j2&})qm$lt^+ zr?kJ$Us+Cj`!q!C zM&sm&4V92R7@NgCL~T6)=1e%=j+*LOP3cn8<%K0Js9R6#=azxVtLGb;bJ4@v(xER7_WU^PUW=8z{2)r z95!7*+bxC=F9<=zO2j?WL$WDWK)E*J!j5=Ip$#;M00;KZKC`Mi*Nb)CrcP$<7irDR zSk{!=JaqeVJRf>FK7fCTisZxG1R@iVvLto&sgZoX%Kl0jICeJ89pgfY%CVY#&S{l6 zCDA%4oeIO@KA7gp(|EE}wZUv3quKN9gYA8_zkeQa&S66kb|#D=$hFgjXr2AoRIq1q zx@{<8Oz22zqT+1d&vEh&`i;HBx%m=@`Gemj*d3}nPVkv+uHWbBw?_nuX+b{3ss2l> z2idONz9XRjQsaCBK}W=eu(XPFdjLt1FHm+{yOO@ueOIB<+rQzHAA00^JlxAV;!dDP zJ^G$Jl>09Y#^ubhFQ)AJ5%&sPpE;Q}C>o4;DOH^u%wHaa zO7n4S+FTPaXUDA8SZikvpA3;aks=Jazw^<%^btG8ohm2o)L-&S;>s!AT#Qnxw>c&v z0edkDtrNeq)$2PM6tk=tcNk2Rzq)adF8?g)UwI~{Sxt*U;enU+vzJXeY9%5v(F59N zZOIfa-+XIr6uW^z;DLzI>P)fxaF3AO#+lV>w=8ub{Ebp_7Lsg7<>Z2cFiD*p%;A2S zwl5RzNUU`w{$>?v=JLS4Bg@Lb!RndYpnl)FI7ZI4%%V3*n;rK@Sk{i)+uJfI@0I=y z^BymR#O!vTakz0EUGegg#8>Bn-1wStW#h{tGSu3rdo8;!QaGbP*e?y%?!H%Aq>Scb zh=JF1TvRO@rLt;2l^jAf{D2rHAj&nQx zxVzD8-u3mSaNFgq50Rm_%~(Zn)FCB81&adFb)^TD`=xmlVA~}>w#14$c~h`}pTUZ> z+jx_o5*EdiPu4npEVJDchIxy07m)kJoY-PwyILzPaS0L??OFDJO*_0pCG%_@jG9`_ zncK=8mwP`tdtyaP;2fe-jS*I1$>#%}x*W952#T-7x~kWXx8IEM{J@h@`?%2Whw-JY zJjakR&Ly8c;HVV{nh48%M3NriVX!S~ll_ZzyK4$Ryd^f1))(`t)W^dK0icJD@#cHT zG^6GXHi~)>19$k6dDI$cz;68h8zq$5a>n5x%Sv3H7V(HkAvm3D>cHymIo>`Mx7p>Ds~5&Go$0YSm;zs zor4wjp@cT zZy1l6RpLeYR9ezWDbq!EJmoIv?Aj+&mfyW8r;Lvu}v5`Bic2s%Vh1`7Jq3zA? z#`fL!Dxijox^xEpl+TAV=jU>7NeWut-D-^RqIi}5IC{H7u#Kr4?>{Ic-{8}MT|=uG zE{MclE$PfDnv#lmW!M@eAu9|0wU*Z050o2kWmUNE>OTsyJ32nbWuk?^($TRx(?$^@ z`P>AKBXeieF5krV{x4Bx8Urpq&PfLRRR&wt=acf+>)+&5`M#!{wXmebk4caSD8;e! 
z!RA5Rj7@|^HVoZu@%;O%H6x8WTeq*@q%n4;F!`XEo1ccosC%B zTu)O_1i)NnfTEhWUkMGL!^I3R z#iLOGuK463fH1{@;bs~=MM+0zG2JgvBHOh$Ia6mtagw5Mzp-WXtpwrKiu2o)PZFlf ztrO1sQo3SK?tAsiJ{gr1wpaKFl%DN}Ycyc>OqI;}_MB^u2s+JV1N^(kE4UQkY?V@)-ukPKSs~x9RO-i-3!)Y^v*x-aKMQ=Qr>K^0?8D zJ%|DPlg9`C%X%P3GyU`_Iqe5Jd8(_~r^JMWKqA@onb}AdCZ>{mX%)dWX}t;Z{+76Ja5Rv2`>IZE^(}j_)FOWLRT|If#*gk9(dRgy z=Waf?i z{ccng^a9Vef2RfDMoteR^Cl*IHF|@( zp(9Z%o2#9@j_)vx7gF%kz-Frb&vyVc$uR=EP2-Q&S!}o;aWvDoEvL;LS7AivBn*;& zkfVm|Fww++W(58FcA3Z($h)|dl+d1a*M+(cG zgzqeTi~-MMf&&{*#JEc}2e*&feuPHWRjswOLAQ^m-Ph;zc+&c%^10uuAz`!(%(RYQnBsiYRN>s1=*8+th841>1g^|(f_MCh zQeB2zR+<853a!i>83KZTpNUEm8wxzn#Kdgn+lN*?`>bae_-XjX1HDhRj+%ljw#W*R z3>8)ezxP10?eu8AJL1c!d+}KnUjO9)Rlfkb7lqG}p7l)#ATsTbNY!l^r=}~ZqJl<^9%X<;^=J0b)?%E;IckvLkVqnzj|z$!(<>|4i|D@}Pffxa zxbZ%XeM&=39m1M|@_jO*+wl^Pt1VpTm&Phjc${2&K=?8|DqMkFA8;(}KUaM62lEq~ zfn+8|{ffY7LT-R2`Rui|HL7l66Z=Z4@y<*Ig(34aCcysD?WJ1r*0kz9NjKP)s2e^v z2>rv{A`dk*KVhPw30+vwiHMXdEcuS`xxOAA*iIIx^z9ytAr>&It6LIOuA*L%w!Suguql!&lM^&gopR?}aWznX@s6cSZ}?OkUf)<(F7zg|Xu!d_TCp z&ZD;;bc!YKWj@>j#(JCN_vbAZ@$?Mzv-Q4klHF9N`_m~|LT`7cN(l2UurNuaq@<|7 ztC#)I-ZowFHx44UO|%4Nv&Q;v7e()Kx2@SfpnHpwaWJB+D1yH&99e_B5kP4VTf=0w zLl4w{R+!jTv)-FG1BKA3QGlkHkeKLp<|ubK6&=w}@2l4e8XK$BjpV3SA4>N}A$6b` zSsV|VUCytfH^=rs*nFqvSrO7bR^fz!h$@unu$?{B?k?>V@>c*9dO3#-LZ81Inui-s zlbM^bk|p%AsXA2uv+mMU7FS|Ktvdk?%sSif`#aB(%=RykXJF}ZyZC`4>kigx&S5q)AI(B zVON}aWdOpqTbUlEn*ud(^^S5g{ijxQM?Z#s9Q~$BgWCD|>sPGoX=w)s*o5Mt zx#+-G@_~$v;ExUGbFBbxPDv7>rluw}KYV<*9qhX9e5lJkF$*&cO2AdkZV0 zq=ex}Dg9No9PT+LvISF9bjp;}e>~p!scy>}tS!gU3S((<3JN~gc7NUPnRCSGKpY)d znq-WFA15srt-OF2;p3;hc=2Kd1OCHwEn17$g=GVBeU7f0DFWN5%D+TOVDNJ%%QYA^ z7;z3<1ypR^g6igHp*GO1AwWeW07?y5sy<>~fPAo`q8#?h00cj5spWcW zv(ouz>h>ST$Fz>&Mbn`Q5*z+n&V(!_KZmo30rU0jk#$%o{DwA7z|OZ^oOjQIo?fGl zISAl_vR78p(ld5&D$p}9RLojWOiYYbnPbe=0yv}`?rE))$N-!H|1Ct%@11ym979M^8`9!;=EE38YBVgGLa{AQ*K} zY+YGdsgV*Nu()clQqeOeTp@&rIA3RDJX`s)ct9g`WIG4o)GoDnA>7y#Nc?9?EgXH@ zvKt%uPf(uq_VpS13?p(|&xH-_0!!~>=c-9(2-*uA93wYgnKy58fm~_l-7$Q)un5OQ zedX7_zQO?C|5I|3>h>A2bK95K|NB%I7nc_}FXJ;ahdMq2=gRM^Te}J*sS~2$ctQ(5K3*L<%&48OfIk!k9qJjcO_8Jc5%tIb65=C>9F z{O@?zbMN`AY$aD$ZoXN*ckkY({UL4LV&vyAtY{FfzJB#&@5djB-W&2QJ1;d>5fMm`O=5o#hnsoeC5wDXEop z)jHF;tRUCTly2ZRJ&MFYiWW&|XF#{n?3pl5BtmGXsI@gCa|$+)ZwxHi=ds1BpZxzQ zzIdrRr<(EGz`#YJWqKhAsd`yiSu&tauHd=1YMLJn@hUc27$WQS^M_FbRt(=jpSy&uT}iX{MSV=DF?? 
z9{|J&_kHM%MrvezEqG_FhCx;~yxMBs_5Da)j%*dp${N3vz1zv*Mn>E4NL_~IRq3yR zEz87yn3-Zk6)FGzE%d*5i9iyV10Z+*(#IE2CqVH9YXEdWVD{n~!mV%r2WB|JFWB-3d7XZ^zmsZoD405$ z2cr*B>OiShP#J`b8JpC(&)&7m_DDOpEF5mzYse~;j|naPh@0G$??2B$cl`61FDz_q z@odIYPDR_FJ|8TDT@zDN@jm&G%5zY!pw1mt^h!$Hu$ETiHznG&(OZ(0)MX$`nl`Ig za%+eEd4!|;!lBH<;eV+?K&^J|sQ3;sj-)doY!{v1BAf?j4T zs59?vOXk|l@!`Jz&l(Qo(#NRx&sI?LI^Z3Y8d+96HpA=Ub9)#+MLhn1hWPiDKhhnZ zTKb=gJse8n6uSYHw)I|z9plu1I8y1WDt=!#ZxvcJPSpLGl zF)U6i3|<&{k>&fhKO0LFhxhkP>yjskE(9TSo`Dsh0oF$jcA+@I)}w zSCU-KZ{Kw?{6JnQbm*tK`hhMgK}mUAE2IM$m^ku(Ivby(z=81>s9^Fn{*MkC3;s{D zVOofB`={Ry{fMEU1hRaP1cE7k(LH)|)wE1aLn+CMXG(uEcrYzWCg9NRyBR$G!&IHz z>zuf81ORr(2WT9}u#ZVX3kEUmfy7^qnvEwCVkIRuWH_DA%xe;{P#YEHXjT3VewZ9K zKek(o?Ch_S1)B1I_Z*r2@h~~_5w^P@_LkFCrT!U|Kn0=yNB5FznZEFzOVDqZ)ZMG4 zEr3wtgWHcP@AGNwavlqVKTC;RfhO3i9bcn_JIWxG?9Zo;@Q=b?Lo?mR5&Io(3pNs= zc4TGbyT(;v6g!wnG%;@gZo$XlAZ>@>-gJbw{>pM-)R^r8m`R@e*piJbMz>0 z7EFQJ{@-=`CN9Pv1Mb4+f&~&*yZ`-LC=le&L6OtsEm}1@Ujk#tH*7 z{Gq1ocZ)~I@WowswolCKFnmM*`i=2FUvx#K$j;WOUh>W(#gOPY_aO%(vEcn(*VLHJ z@E||zY{fKbhjy_E*%WqWO*xjo_F*5p8eV5I!7Z(}5odfZfNec=T*~Ude>1!Q!Rx$= z)jS#`Tee%_KgVJ|JVJeg_h;D26_{v|>*rpMyB}>^%F78z>3MR?YI%wN85}J-ZC2Mk z!Sh(}%H!A{Gc|u-TlIFCA996W#OEZKk7(X(WXTV(rJ_-}4hIL_ z(*!Y+61i-Jy8i_qA2ah38lYQ90`L|et$(1u{@Wpuw&mF6?VSA57$C3B64*}tR+2Nk zPqlJ^$*KUu7!uHJ*=bm}?#OT7*a0}sJ)8ChHM0QX5fx4l{~sOC$=5a_-OwZMD+-&z zZDXXp#QiMd866WtPGcoO*$9gNq8SY%c~t7ohqbv{3c%heW(r`E;!1y-FZs_-^eX0C zyj9hf^U&n$rs{t2VnbC~L&CujlgiZ8onCN4eR>Ug^Yju9<`R{(X0qlJA&ERTkRA9k zDe1+_msq&CN*OcU5|YxZ&b+Ls6ur_jtp8~knH&y<#MiH0%gQ1HwS%Yy1Tu!SYo8xq zg~ckp5*E$^eB1CSXP}Q2vwP$3gtKG!(h2D41wSPxtLnUyP*g<6$%r1#s79SJ6as+hBd-~2ZXITtmHeT{^yX2OBrC#DKQA-(v{MopTCtt5 z2u@7N=aLpA0(&Yi7^Nj+T8Nuw?yP$Pbt`fll~p2WpM5R#4}1Fb2{IR1OUgH(z)f(v za)P|slZzT0tN6YHp^>cF@k7C*o1ZjsYmn!}*g@l)!IAC{QOR z!JJ8td-;pMfg_n5Iy&ZAwry=S$(Kx8)t~LMmvUWXD44l%GKO507cD~!4XS2k6yJQc zTNbA5>Zx+?!veyE8a1A1iglqmUZ?HxppBj!OFV?XOKr?RuMD@WM_kkpc7LkGc)r#e zm`xcuIf@c-f^T$mX9U!iK*E9Hb6~JFH1J@q5D-jEOv;(82vey`R8_;H!+%;<8P;eK z-uxn9d<6^AYzQY6$?hNYh}6ici0*J?sqJ;way#hrA0Jum!0d@D{O|{lNwL?@tT5cTx;4z58gP;m}2$m|&SFD3VblX9l@QC`&uP!UO1^f!EZl!XOvZxRd@mCP-l?grI5$Gv7iQ{4AUIq0FwY4H-dMhQ z*4^C=ltxie*ORi<6Xt7Z2%Z7Y1lcp!=c55(G4J~s~I4jzfcQu%GQ=z>LAqw^Til* zz4!x^DR=n3)A#lFe=V<1#p&y-ww+85>kZ?7FL35A$eXFyKfpjs&zxUa7`C=%1k6HT z|KQi^V(T6kUB&H~2f~nmNlPkn_Q0tU?e%l_zM&y1lX-4_x5JH#_2O7ykU)wcVQfa( z>NjZLHVJUBvUWRM;zng|&{1 zR>I_81)Pr{E#Xd?^b*kAxa-ahw;o|Rm1@JYvp4^ZGA+% zxQ;r@aOh3bXe`vZFLU8)>PWuj0;@IKkzgC$rn9yon|*)eNpT61!#doBqY7@%z1!dE z5&Z!#sE45!i<>bdB_jilWSJ;-)q3}CaEnPB7_L{fZlwIVH3j)f`y++42V|~*$1M{U zOEb~xh6FOi3l5$spS5-uS|a9x&&k(cYY)qtR%2mr7=qK`fDp33A6%(<34<7ez75%k ztYdv8TJk$B9e^&OU(+WQBO@c}7??UYU4gt`*I;w?$RyU#pz>q;N4pad(PjBvpo4wL zAbQc&@;Y}wyZwlCVc;y*M?8O8bbnpSaSF(AE*GD?{mvBDXa>4jPW*#z;F00V zHElF()CEZ}Z0ic9><4%M2_xCsGH<@z1ngv`1=Q^R-iv8}I#2c8I|(r(a+~>TGHPl9 zuMf29`%B`QKEkBTDTJ{-KoqUptfs=PP1pXz96GN)36!KMy{UB}aP~yv-e)=Uw#p5Q z%%8!WF1ziVsBBoHzsXH^xPM=7rOmkUYe3;2nDzsx0<`+nZKc?G&AFklu<#WnCHy?e zqLrH3OA|BGgrqb{HPf>|6ck+M4Pc}@h+>FWwShPgpoqUg&zto~U%q_Fo5>8!fEe)? 
zjba6n8R%!S^9|^4Rrc-0Y1?@G7CdPy=qxkw;M75s7@I%HY_0YIYA7 zdqvRSId7^84StSI98EDOs;No$@#DvOOUx-;t_Vif(Nkf0cbAbUA6N-*TSO&@(8uCDx zEj%3Zdct-Z>b?L(`kwr1_3`>jP9ZC+Kt@Lw4lG+Eqp!e#kg>BT{QC8a-)>~Lm{Ncy z1Bi(OK{pxsHywHdMrc1i%fi(syv3zPjIXG03AIetSTknwk3Fe8`K;R__6J0|K642M zYJvvNgY#dBWXC;c0LOfg^Y(rcVtAl|IY~nBsG4C^;#DXqenvz97yW@i^*QQIyuYIP|w zEE)v=%C})~`MaOT3Ac*5T5T~OQhDMCVCrv$g)4<5LNX9{U_}W$0dntoJueQU2X?#6 zP{=qr;X^4v%3i&~?*&R$p;v%~eUbE{ip-e8bkj69ucp9ezBc&d$EzFG{XFDt|Ef{r z%cpap{$4{ShweKZ>KjdNS1qxXQ6A0iH^zf~7lyKUgM8vxL9S=d{x|@$eTH?3pidHx zj^qGztgL)nYdb{A!4L_kL*i)^Cteu^*)O_HuGO}S5xKb=#F=(AC9aJH019O1;OJXO zmjvX9J0m(bIC!sNLk#1FZ{=0)JTFOu2ON*EKrC7niSYvHA>miiOIb`Dkff&dta`)% zyta{kspo-fNEe;&A5Cq6Ud&B6`Y3-?sRt*CVfSzYs3x;@(;6>2(o>XWJ>|gzYxT*-*0~IY>0)9RzOp^ZE#SMZzVm=5Uq3! zV#6qEde~0+4e_=!-JWIUWR&wM{gDQOF3TFA@M?H$WPx~3i~*jB%@8Gk+JZ0l_?Fj< zjFHjMx{WG`?D5`n00>R(rKF@J5RFJ|6%I-md#ja~|LcpY^xC@dg@BvRD}+}#oMG|; zX>xW#X>u9&KOY%*k5^ahSL3gIm0mnz;?@BAk55eOT%(ffWug7N%_a?q*c0w~STvrG znNF>nTN~xpSB_)T=5ZMr;yT`fi;XUM#s!KJl0BDKg#^^RHz(=J?h+=S<^H87VDgy+ z&w;CD6o2kIj|p25!sJoq0buUAWgSp~MBZ8+6%e%GkPySV`HIR^ecNv_@J-0DyDKsk^)E9s0{8(GCSmdC%;{Wx0h4!t;jN&eBSm#P-B zeXzQA$16YXnKodCGI%haW_h9~TciUHLR>~l&z3^QBQTw>tX0ueQ8Y1m0Un**gsB>&3MSKU&Nv-MwVdf?Z_J=dzW77bM--kte29y45LfUyi?-Qsoj%o**E zy7pBYSD{uYpvD7i)cOvI(1*EJegF;d>MDBck;347|ETOm zYq)ACAt?#KTsbW*uLLYCnKz2m!otagJE>;atpWUhV?kX?sGaSY1Q8T`;WMcp60orQ z`i4w_O{HM#`4G`Faunfg3?5q;F>A75ezEYF2y!;M@QF_^=735eZ;i+SH6+?HXQBI_ z?+~`EX~PYy2;_=+V0;UlwiWAjY(dPp!evO=nl17H-K^yrHh_T4fXM(IfsKO`LNO6D zIVG4=yrua!8o;hrP0K)O^@JtDbyAu@@ z6%`c$%(DNi5p)$KqIQoKy=3OQ=(An?)cKT;K$^QnsUQhU9u?K6=;H=6>#GJyc4TDa;LuQGHdMc9z{wl~D$~nKN+Mic zU2OoX%hrd#n<@_^=w2@No8@8g4v7A#&07kSi-7)>j`v=$cl9{!^>KKZ`?sISh$pg-Z3u(v4$N7bF!2V$$ z(2ViN{ZF{DaSmE~`e)9v|Hzo*(^)&1I%Xl_aZXwJH{^_r#dMay;8-{Uq79TrF;1rj zMh?Z9@4uTP4IrkELYZXs1^(t z>(<2_fQX9d`yITR?~4H=(ZwQWtxmLx38>h7)t(pv*WYDbe4W~1b)o@Sa(%wL9;@}w z!+>AuUFO+-m^)Y-tY48A{(LBLD+Bw76B8CjO7~ZtVL~YWbK1#$bN_$WSyKOf z2n3Rv)pvAcuUmFy2tq=${dD)op?UZQr`Q!sz~K!C?C7q>2t0Ix>{>0VeIov~7k*bI zaHUFZzD;+{CZk^w!FGES50I^SJ3xny)7~VmC_7~V;_h@c;{X(bY>2LGBM1;y0v`8$ z^~|&3qV)!VI;f<=&7&7Aza^bK`nx8o=$-4K4vF7M`0Erh-%o%kT~`tY0-p6^d2lOd zo*o8k5-^MU|K2akyaQKnw>`Xh=yf;?uEb%|;=Bf5RykRo?Qu017N}h4 ztJV#f!)RL)!W@2xmpr8ONLPi%FKLOly7;%@8?-Ig7u7v0v+JYhfn z3AYSMDv!YOY-iZj-(-2Y!lQFBlkzaXen|`h^m3NxmIX5*PqMzRZZ(BB6e+f+lckN> zXwWppaTyK+;6`oN&0ETQ2`6)XN!a3@pQjTKIo_d&HK@^Y(@}Tw8%pN_3f2BQ+`X;v zHZT|9g@#F}4j8LdvqYUFoj2ZTgpbBQ37kC--pYwBu-}?B=P>Q>@j?i@H%X@kDo1S; zI}NYaOzZb-I&;t3O!s(fXANM-inP8jbUyOApnF%;{rJvbUEwt$_Ry^n6dgm@jm1mu zoLJ!m=Ec}txr;2F<8x~70OmKp#n}W`6u*5ERbY|B+iT_i24C1(1BPL~4FLtmerab^ z@a~l3Q%VX_@nd*a&+E=*nEyh@Y!DP+kq-Oa)Nf)I@I?sutEZkHK4f|F`>S!`Bqr22 zxo-wQqDc5sTU)%wTFmuh1g=PR?1yv(@0N&?&5_&+4_M5)wS-%IUI@h&qZ2k|U(-h3 zO}}IKSN+lMZ24Q2uFmgGTDf1(nqS>rqBI4nJ-M0(t2numvcSAQHROnI@-nlo$j^=O6{@Ddj|DlgN3B`@MpL7d%*&{Tx1JhI$BP+Y`q+(kB8bV-JV-6a-~(q zx%a1ms$-8!G`k$L+EP!t9v~fZENr`pk{i=$MIOEB51c8w&H}2Bw(0TW)D|6wonGkR zshb~~uy|NcdasAJqdXp@!XdKYw(QfBXZ(}(d`zw)Gz77+c(D-s(0ftO5W;KZ3)!f=}=hYrn~`B>4y zPQ}p^B@b}4g?E!P@oXM7CezrC$C#!Lq-^w5q4dslgi&d`iS7ux3(D5aYKiDtis&1Jke7jQqQ}k`;NaW?Q&FK2ZJ!d2iok)g#)ME3&r~2`;&n<@kXU`E>{B5n z{;U~rnM}SD0*kII^Mj`uKf|+gj(1M)&0dNCRZd(KQR3-{cjg+n*lN=?M!GW2n3UQe z3~Eibstq7qTlcs!Zgs;aQW^{~0_H^1x&p~g*R>sn{<{oYv30iVODwfuP$>9w9mi8E zKfwoHsB?_$oQI;eQKLhDnz`e-=H!FoE`b`cgq9ZT?GM>zwkvVHmJZH?{;-6~c(Q~2(i8R}R2tN5r#8y|Ev6Yv~IG6a1@kvh;4D@AD7 zY!3z)%Q#_K8X|p7GcVjdHDKi(&00EctTHDvO{Z$3?vJDe-HohWo^BELd=?nq-ooc| z_xU*J=SHwUjHhI7B~&K>At1qz8w!2z-9y?ixm`tN`jkglFl)GmSj*iU@wDjbN9fV3 zN}1mtI5G+BuD=!YDq(x`6%^wN&=pr#2!q)V;-3d*qWM?Rv8T8}H>XXvZaQ{YB?3)T 
[... base85-encoded GIT binary patch data omitted ...]

diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
index ff89589baefb1..efcdb22778ef4 100644
--- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
+++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
@@ -46,7 +46,7 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno
             "Name": [
                 "Braund, Mr. Owen Harris",
                 "Allen, Mr. William Henry",
-                "Bonnell, Miss. 
Elizabeth", + "Bonnell, Miss Elizabeth", ], "Age": [22, 35, 58], "Sex": ["male", "male", "female"], From 566e59255b8332a691676d3b4a8bfc81ee3633b5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Jun 2024 08:11:15 -1000 Subject: [PATCH 081/272] PERF: cache plotting date locators for DatetimeIndex plotting (#58992) * PERF: cache plotting date locators for DatetimeIndex plotting * Type vmin, vmax --- pandas/plotting/_matplotlib/converter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index a8f08769ceae2..fc63d65f1e160 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -556,7 +556,8 @@ def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: return ppd, ppm, ppy -def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] @@ -755,7 +756,8 @@ def _second_finder(label_interval: int) -> None: return info -def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin @@ -826,7 +828,8 @@ def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -873,7 +876,8 @@ def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 From 5435b1d9e853430ff3c5dbb00e92634f61ffb209 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Fri, 14 Jun 2024 19:53:35 +0300 Subject: [PATCH 082/272] WEB: Correctly link to PR (#59011) DOC: Correctly link to PR in PDEP-1 --- web/pandas/pdeps/0001-purpose-and-guidelines.md | 1 + 1 file changed, 1 insertion(+) diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md index bb15b8f997b11..7f5f0326eba6c 100644 --- a/web/pandas/pdeps/0001-purpose-and-guidelines.md +++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md @@ -285,3 +285,4 @@ hope can help clarify our meaning here: [51417]: https://github.com/pandas-dev/pandas/pull/51417 [28900]: https://github.com/pandas-dev/pandas/issues/28900 [35407]: https://github.com/pandas-dev/pandas/issues/35407 +[53576]: https://github.com/pandas-dev/pandas/pull/53576 From 6895f740c0d9fe2b77673a74eb1c3ab8b0f627a0 Mon Sep 17 00:00:00 2001 From: Zhengbo Wang Date: Sat, 15 Jun 2024 00:55:01 +0800 Subject: [PATCH 083/272] BUG: Fix issue with negative labels in `group_cumsum` (#58984) * BUG: Fix issue with negative labels in group_cumsum function * Remove blank line * Revert remove blank line * Add test * Add what's new * typo --- 
doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/groupby.pyx | 5 +++++ pandas/tests/groupby/transform/test_transform.py | 9 +++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4a02622ae9eda..80e5e89b79690 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -571,6 +571,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) +- Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`) - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 15f8727c38f8d..7937b2ab72c37 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -399,7 +399,12 @@ def group_cumsum( lab = labels[i] if lab < 0: + # GH#58811 + if uses_mask: + result_mask[i, :] = True + out[i, :] = 0 continue + for j in range(K): val = values[i, j] diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 726c57081373c..a189d6772ece4 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1591,3 +1591,12 @@ def test_min_one_dim_no_type_coercion(): expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32") tm.assert_frame_equal(expected, result) + + +def test_nan_in_cumsum_group_label(): + # GH#58811 + df = DataFrame({"A": [1, None], "B": [2, 3]}, dtype="Int16") + gb = df.groupby("A")["B"] + result = gb.cumsum() + expected = Series([2, None], dtype="Int16", name="B") + tm.assert_series_equal(expected, result) From 0fb5cfe7edb332d2cd4553bd73671d58b67bd5da Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 06:58:22 -1000 Subject: [PATCH 084/272] PERF: Only copy in plotting when needed (#58958) * PERF: Only copy in plotting when needed * Remove more unnecessary copies --- pandas/plotting/_core.py | 10 +++------- pandas/tests/plotting/frame/test_frame.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 0daf3cfafe81c..0a29ab530c2fc 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -982,10 +982,7 @@ def __call__(self, *args, **kwargs): f"Valid plot kinds: {self._all_kinds}" ) - # The original data structured can be transformed before passed to the - # backend. For example, for DataFrame is common to set the index as the - # `x` parameter, and return a Series with the parameter `y` as values. 
- data = self._parent.copy() + data = self._parent if isinstance(data, ABCSeries): kwargs["reuse_plot"] = True @@ -1005,7 +1002,7 @@ def __call__(self, *args, **kwargs): if is_integer(y) and not holds_integer(data.columns): y = data.columns[y] # converted to series actually. copy to not modify - data = data[y].copy() + data = data[y].copy(deep=False) data.index.name = y elif isinstance(data, ABCDataFrame): data_cols = data.columns @@ -1032,8 +1029,7 @@ def __call__(self, *args, **kwargs): except (IndexError, KeyError, TypeError): pass - # don't overwrite - data = data[y].copy() + data = data[y] if isinstance(data, ABCSeries): label_name = label_kw or y diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index e809bd33610f1..b381c4fce8430 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1120,7 +1120,7 @@ def test_boxplot_return_type_invalid_type(self, return_type): def test_kde_df(self): pytest.importorskip("scipy") - df = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) ax = _check_plot_works(df.plot, kind="kde") expected = [pprint_thing(c) for c in df.columns] _check_legend_labels(ax, labels=expected) From 3bcc95f042cd3fe7784a007cdaf61d8d4314fce9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 07:00:18 -1000 Subject: [PATCH 085/272] PERF: Use shallow copies/defer copies in io (#58960) --- pandas/io/json/_json.py | 12 ++++++++---- pandas/io/json/_table_schema.py | 2 +- pandas/io/sql.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 13d74e935f786..ff01d2f62761b 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -369,18 +369,22 @@ def __init__( msg = "Overlapping names between the index and columns" raise ValueError(msg) - obj = obj.copy() timedeltas = obj.select_dtypes(include=["timedelta"]).columns + copied = False if len(timedeltas): + obj = obj.copy() + copied = True obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat()) - # Convert PeriodIndex to datetimes before serializing - if isinstance(obj.index.dtype, PeriodDtype): - obj.index = obj.index.to_timestamp() # exclude index from obj if index=False if not self.index: self.obj = obj.reset_index(drop=True) else: + # Convert PeriodIndex to datetimes before serializing + if isinstance(obj.index.dtype, PeriodDtype): + if not copied: + obj = obj.copy(deep=False) + obj.index = obj.index.to_timestamp() self.obj = obj.reset_index(drop=False) self.date_format = "iso" self.orient = "records" diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index d4b412404c308..b44aecff79779 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -114,7 +114,7 @@ def set_default_names(data): ) return data - data = data.copy() + data = data.copy(deep=False) if data.index.nlevels > 1: data.index.names = com.fill_missing_names(data.index.names) else: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 874320f08fb75..c8c9fd99d0165 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1014,7 +1014,7 @@ def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: def insert_data(self) -> tuple[list[str], list[np.ndarray]]: if self.index is not None: - temp = self.frame.copy() + temp = self.frame.copy(deep=False) temp.index.names = 
self.index try: temp.reset_index(inplace=True) From c1dcd5437182ac54c0527b9a90fb74715666eb3a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 07:01:30 -1000 Subject: [PATCH 086/272] PERF/CLN: Avoid ravel in plotting (#58973) * Avoid ravel in plotting * Use reshape instead of ravel * Add type ignore --- pandas/plotting/_matplotlib/boxplot.py | 9 ++------- pandas/plotting/_matplotlib/core.py | 2 +- pandas/plotting/_matplotlib/hist.py | 17 ++++++----------- pandas/plotting/_matplotlib/tools.py | 26 +++++++++++++++----------- 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 11c0ba01fff64..6bb10068bee38 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -311,8 +311,6 @@ def _grouped_plot_by_column( layout=layout, ) - _axes = flatten_axes(axes) - # GH 45465: move the "by" label based on "vert" xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None) if kwargs.get("vert", True): @@ -322,8 +320,7 @@ def _grouped_plot_by_column( ax_values = [] - for i, col in enumerate(columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), columns): gp_col = grouped[col] keys, values = zip(*gp_col) re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs) @@ -531,10 +528,8 @@ def boxplot_frame_groupby( figsize=figsize, layout=layout, ) - axes = flatten_axes(axes) - data = {} - for (key, group), ax in zip(grouped, axes): + for (key, group), ax in zip(grouped, flatten_axes(axes)): d = group.boxplot( ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds ) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 2d3c81f2512aa..22be9baf1ff5c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -586,7 +586,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: fig.set_size_inches(self.figsize) axes = self.ax - axes = flatten_axes(axes) + axes = np.fromiter(flatten_axes(axes), dtype=object) if self.logx is True or self.loglog is True: [a.set_xscale("log") for a in axes] diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index ca635386be335..2c4d714bf1a0c 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -95,11 +95,12 @@ def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]): def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray: """Calculate bins given data""" nd_values = data.infer_objects()._get_numeric_data() - values = np.ravel(nd_values) + values = nd_values.values + if nd_values.ndim == 2: + values = values.reshape(-1) values = values[~isna(values)] - hist, bins = np.histogram(values, bins=bins, range=self._bin_range) - return bins + return np.histogram_bin_edges(values, bins=bins, range=self._bin_range) # error: Signature of "_plot" incompatible with supertype "LinePlot" @classmethod @@ -322,10 +323,7 @@ def _grouped_plot( naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout ) - _axes = flatten_axes(axes) - - for i, (key, group) in enumerate(grouped): - ax = _axes[i] + for ax, (key, group) in zip(flatten_axes(axes), grouped): if numeric_only and isinstance(group, ABCDataFrame): group = group._get_numeric_data() plotf(group, ax, **kwargs) @@ -557,12 +555,9 @@ def hist_frame( figsize=figsize, layout=layout, ) - _axes = 
flatten_axes(axes) - can_set_label = "label" not in kwds - for i, col in enumerate(data.columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), data.columns): if legend and can_set_label: kwds["label"] = col ax.hist(data[col].dropna().values, bins=bins, **kwds) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index ae82f0232aee0..f9c370b2486fd 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -18,7 +18,10 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import ( + Generator, + Iterable, + ) from matplotlib.axes import Axes from matplotlib.axis import Axis @@ -231,7 +234,7 @@ def create_subplots( else: if is_list_like(ax): if squeeze: - ax = flatten_axes(ax) + ax = np.fromiter(flatten_axes(ax), dtype=object) if layout is not None: warnings.warn( "When passing multiple axes, layout keyword is ignored.", @@ -260,7 +263,7 @@ def create_subplots( if squeeze: return fig, ax else: - return fig, flatten_axes(ax) + return fig, np.fromiter(flatten_axes(ax), dtype=object) else: warnings.warn( "To output multiple subplots, the figure containing " @@ -439,12 +442,13 @@ def handle_shared_axes( _remove_labels_from_axis(ax.yaxis) -def flatten_axes(axes: Axes | Iterable[Axes]) -> np.ndarray: +def flatten_axes(axes: Axes | Iterable[Axes]) -> Generator[Axes, None, None]: if not is_list_like(axes): - return np.array([axes]) + yield axes # type: ignore[misc] elif isinstance(axes, (np.ndarray, ABCIndex)): - return np.asarray(axes).ravel() - return np.array(axes) + yield from np.asarray(axes).reshape(-1) + else: + yield from axes # type: ignore[misc] def set_ticks_props( @@ -456,13 +460,13 @@ def set_ticks_props( ): for ax in flatten_axes(axes): if xlabelsize is not None: - mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize) + mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize) # type: ignore[arg-type] if xrot is not None: - mpl.artist.setp(ax.get_xticklabels(), rotation=xrot) + mpl.artist.setp(ax.get_xticklabels(), rotation=xrot) # type: ignore[arg-type] if ylabelsize is not None: - mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize) + mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize) # type: ignore[arg-type] if yrot is not None: - mpl.artist.setp(ax.get_yticklabels(), rotation=yrot) + mpl.artist.setp(ax.get_yticklabels(), rotation=yrot) # type: ignore[arg-type] return axes From f703cabe5f2e3b927e91f2f0bc253ffc35622a1e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 07:02:28 -1000 Subject: [PATCH 087/272] PERF: Use reshape instead of ravel/flatten (#58972) * Use reshape instead of flatten * Use reshape instead of ravel' * add back tuple --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/generic.py | 4 +++- pandas/core/indexing.py | 2 +- pandas/core/reshape/reshape.py | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3d55513ab914c..d073cf0b11c6b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2970,7 +2970,7 @@ def transpose_homogeneous_pyarrow( """ arrays = list(arrays) nrows, ncols = len(arrays[0]), len(arrays) - indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten() + indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1) arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks]) arr = 
arr.take(indices) return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 84745b25b5eef..599b3d5578fca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9271,7 +9271,9 @@ def compare( # reorder axis to keep things organized indices = ( - np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + np.arange(diff.shape[axis]) + .reshape([2, diff.shape[axis] // 2]) + .T.reshape(-1) ) diff = diff.take(indices, axis=axis) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9140b1dbe9b33..8d1239ff71174 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2440,7 +2440,7 @@ def _align_frame(self, indexer, df: DataFrame) -> DataFrame: ax = self.obj.axes[i] if is_sequence(ix) or isinstance(ix, slice): if isinstance(ix, np.ndarray): - ix = ix.ravel() + ix = ix.reshape(-1) if idx is None: idx = ax[ix] elif cols is None: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 5426c72a356d6..a8efae8da82c8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -842,7 +842,7 @@ def _convert_level_number(level_num: int, columns: Index): [x._values.astype(dtype, copy=False) for _, x in subset.items()] ) N, K = subset.shape - idx = np.arange(N * K).reshape(K, N).T.ravel() + idx = np.arange(N * K).reshape(K, N).T.reshape(-1) value_slice = value_slice.take(idx) else: value_slice = subset.values @@ -924,7 +924,7 @@ def _reorder_for_extension_array_stack( # idx is an indexer like # [c0r0, c1r0, c2r0, ..., # c0r1, c1r1, c2r1, ...] - idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() + idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.reshape(-1) return arr.take(idx) From bf1bef572244af609a32a81ee139cdc879610943 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Fri, 14 Jun 2024 21:14:14 +0300 Subject: [PATCH 088/272] CI: remove xfail in test_to_xarray_index_types (#59013) * CI: xfail test_to_xarray_index_types only on v2024.5 * Remove xfail --- pandas/tests/generic/test_to_xarray.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index 491f621783a76..d8401a8b2ae3f 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -9,7 +9,6 @@ date_range, ) import pandas._testing as tm -from pandas.util.version import Version pytest.importorskip("xarray") @@ -30,17 +29,11 @@ def df(self): } ) - def test_to_xarray_index_types(self, index_flat, df, using_infer_string, request): + def test_to_xarray_index_types(self, index_flat, df, using_infer_string): index = index_flat # MultiIndex is tested in test_to_xarray_with_multiindex if len(index) == 0: pytest.skip("Test doesn't make sense for empty index") - import xarray - - if Version(xarray.__version__) >= Version("2024.5"): - request.applymarker( - pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/9026") - ) from xarray import Dataset From dd87dd3ef661ac06e30adc55d25a1f03deda3abf Mon Sep 17 00:00:00 2001 From: Tilova Shahrin <46762829+tilovashahrin@users.noreply.github.com> Date: Fri, 14 Jun 2024 18:52:54 -0400 Subject: [PATCH 089/272] Change link from NEP 29 to SPEC 0 for Numpy Guidelines (#59017) * Change link from NEP 29 to SPEC 0 for Numpy Guidelines * Update doc/source/development/policies.rst Co-authored-by: 
Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 doc/source/development/policies.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst
index f958e4c4ad1fc..a3665c5bb4d1f 100644
--- a/doc/source/development/policies.rst
+++ b/doc/source/development/policies.rst
@@ -51,7 +51,7 @@ pandas may change the behavior of experimental features at any time.
 Python support
 ~~~~~~~~~~~~~~
 
-pandas mirrors the `NumPy guidelines for Python support `__.
+pandas mirrors the `SPEC 0 guideline for Python support `__.
 
 Security policy
 ~~~~~~~~~~~~~~~

From c2eb3dacefc0f5c915aa495d502bdf40f77aafb5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 17 Jun 2024 10:13:28 -0700
Subject: [PATCH 090/272] Bump pypa/cibuildwheel from 2.18.1 to 2.19.1 (#59027)

Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.18.1 to 2.19.1.
- [Release notes](https://github.com/pypa/cibuildwheel/releases)
- [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md)
- [Commits](https://github.com/pypa/cibuildwheel/compare/v2.18.1...v2.19.1)

---
updated-dependencies:
- dependency-name: pypa/cibuildwheel
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/wheels.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index d7a98671c42bc..4b34d2b21495b 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -140,7 +140,7 @@ jobs:
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.18.1
+        uses: pypa/cibuildwheel@v2.19.1
         with:
           package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:

From b7970fc4db0f11df383c437c681db3e91a973ace Mon Sep 17 00:00:00 2001
From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com>
Date: Mon, 17 Jun 2024 19:15:19 +0200
Subject: [PATCH 091/272] DOC: update the documentation for Timestamp: add to parameters the missing unit 'W' and an example (#59033)

add 'W' as a valid unit for Timestamp
---
 pandas/_libs/tslibs/timestamps.pyx | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 04bd439b40b8d..9cd0fea1d618e 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -1378,7 +1378,7 @@ class Timestamp(_Timestamp):
         Time zone for time which Timestamp will have.
     unit : str
         Unit used for conversion if ts_input is of type int or float. The
-        valid values are 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For
+        valid values are 'W', 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For
         example, 's' means seconds and 'ms' means milliseconds.
        For float inputs, the result will be stored in nanoseconds, and
@@ -1417,6 +1417,11 @@ class Timestamp(_Timestamp):
     >>> pd.Timestamp(1513393355.5, unit='s')
     Timestamp('2017-12-16 03:02:35.500000')
 
+    This converts an int representing a Unix-epoch in units of weeks
+
+    >>> pd.Timestamp(1535, unit='W')
+    Timestamp('1999-06-03 00:00:00')
+
     This converts an int representing a Unix-epoch in units of seconds
     and for a particular timezone
 
From 5d451fe01d35ae91750a43a7190a5e378831b8fe Mon Sep 17 00:00:00 2001
From: Zhengbo Wang
Date: Tue, 18 Jun 2024 01:17:19 +0800
Subject: [PATCH 092/272] PERF: Fix regression from #58984 (#59025)

* check uses_mask before

* backup gh

* backup gh
---
 pandas/_libs/groupby.pyx | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 7937b2ab72c37..d7e485f74e58b 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -398,11 +398,12 @@ def group_cumsum(
 
     for i in range(N):
         lab = labels[i]
 
-        if lab < 0:
+        if uses_mask and lab < 0:
             # GH#58811
-            if uses_mask:
-                result_mask[i, :] = True
-                out[i, :] = 0
+            result_mask[i, :] = True
+            out[i, :] = 0
+            continue
+        elif lab < 0:
             continue
 
         for j in range(K):

From 3809e2ae913ab97d30d9c5497a5cbb20d5ac8efe Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Tue, 18 Jun 2024 00:32:46 +0530
Subject: [PATCH 093/272] DOC: fix SA01 for pandas.Timedelta.as_unit (#59030)

---
 ci/code_checks.sh                  | 1 -
 pandas/_libs/tslibs/timedeltas.pyx | 6 ++++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 4d74fec24c4ab..ffa26f4821705 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -211,7 +211,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.to_frame SA01" \
         -i "pandas.Series.to_markdown SA01" \
         -i "pandas.Series.update PR07,SA01" \
-        -i "pandas.Timedelta.as_unit SA01" \
         -i "pandas.Timedelta.asm8 SA01" \
         -i "pandas.Timedelta.ceil SA01" \
         -i "pandas.Timedelta.components SA01" \
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 4ff2df34ac717..dc4aed9920734 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1731,6 +1731,12 @@ cdef class _Timedelta(timedelta):
         -------
         Timedelta
 
+        See Also
+        --------
+        Timedelta : Represents a duration, the difference between two dates or times.
+        to_timedelta : Convert argument to timedelta.
+        Timedelta.asm8 : Return a numpy timedelta64 array scalar view.
+ Examples -------- >>> td = pd.Timedelta('1001ms') From 8395f981851323913763790ad959a80e3e882ba9 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 18 Jun 2024 00:33:13 +0530 Subject: [PATCH 094/272] DOC: fix SA01 for pandas.NamedAgg (#59029) --- ci/code_checks.sh | 1 - pandas/core/groupby/generic.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ffa26f4821705..5dbec7f0c8a28 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -91,7 +91,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.to_frame RT03" \ -i "pandas.NA SA01" \ -i "pandas.NaT SA01" \ - -i "pandas.NamedAgg SA01" \ -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 945b9f9c14c0b..5a9805e3f8c93 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -124,6 +124,10 @@ class NamedAgg(NamedTuple): Function to apply to the provided column. If string, the name of a built-in pandas function. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Examples -------- >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) From ee05885328bca2f627e2d249fd3aed2df1cf9ef4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 17 Jun 2024 09:59:34 -1000 Subject: [PATCH 095/272] CLN: BaseGrouper (#59034) --- pandas/core/groupby/generic.py | 3 +- pandas/core/groupby/groupby.py | 17 ++++++---- pandas/core/groupby/ops.py | 57 ++++++++-------------------------- pandas/tests/test_sorting.py | 4 ++- 4 files changed, 29 insertions(+), 52 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5a9805e3f8c93..eb334e0e57493 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -686,7 +686,8 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: b 1 dtype: int64 """ - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups val = self.obj._values codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1b58317c08736..83bb79bcddcf8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1360,7 +1360,7 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, ngroups = self._grouper.group_info + ngroups = self._grouper.ngroups sorted_index = self._grouper.result_ilocs sorted_ids = self._grouper._sorted_ids @@ -1969,7 +1969,8 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2185,7 +2186,8 @@ def count(self) -> NDFrameT: Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups mask = ids != -1 is_series = data.ndim == 1 @@ -3840,7 +3842,8 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, ngroups = self._grouper.group_info + ids 
= self._grouper.ids + ngroups = self._grouper.ngroups col_func = partial( libgroupby.group_fillna_indexer, @@ -4361,7 +4364,8 @@ def post_processor( qs = np.array([q], dtype=np.float64) pass_qs = None - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups if self.dropna: # splitter drops NA groups, we need to do the same ids = ids[ids >= 0] @@ -5038,7 +5042,8 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4f40c4f4283f0..58c27d80ea99a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -73,7 +73,6 @@ Generator, Hashable, Iterator, - Sequence, ) from pandas.core.generic import NDFrame @@ -581,14 +580,14 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: Sequence[grouper.Grouping], + groupings: list[grouper.Grouping], sort: bool = True, dropna: bool = True, ) -> None: assert isinstance(axis, Index), axis self.axis = axis - self._groupings: list[grouper.Grouping] = list(groupings) + self._groupings = groupings self._sort = sort self.dropna = dropna @@ -596,10 +595,6 @@ def __init__( def groupings(self) -> list[grouper.Grouping]: return self._groupings - @property - def shape(self) -> Shape: - return tuple(ping.ngroups for ping in self.groupings) - def __iter__(self) -> Iterator[Hashable]: return iter(self.indices) @@ -628,11 +623,15 @@ def _get_splitter(self, data: NDFrame) -> DataSplitter: ------- Generator yielding subsetted objects """ - ids, ngroups = self.group_info - return _get_splitter( + if isinstance(data, Series): + klass: type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame + klass = FrameSplitter + + return klass( data, - ids, - ngroups, + self.ngroups, sorted_ids=self._sorted_ids, sort_idx=self.result_ilocs, ) @@ -692,7 +691,8 @@ def size(self) -> Series: """ Compute group sizes. 
""" - ids, ngroups = self.group_info + ids = self.ids + ngroups = self.ngroups out: np.ndarray | list if ngroups: out = np.bincount(ids[ids != -1], minlength=ngroups) @@ -729,12 +729,6 @@ def has_dropped_na(self) -> bool: """ return bool((self.ids < 0).any()) - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - result_index, ids = self.result_index_and_ids - ngroups = len(result_index) - return ids, ngroups - @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis @@ -1123,10 +1117,6 @@ def indices(self): i = bin return indices - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - return self.ids, self.ngroups - @cache_readonly def codes(self) -> list[npt.NDArray[np.intp]]: return [self.ids] @@ -1191,29 +1181,25 @@ class DataSplitter(Generic[NDFrameT]): def __init__( self, data: NDFrameT, - labels: npt.NDArray[np.intp], ngroups: int, *, sort_idx: npt.NDArray[np.intp], sorted_ids: npt.NDArray[np.intp], ) -> None: self.data = data - self.labels = ensure_platform_int(labels) # _should_ already be np.intp self.ngroups = ngroups self._slabels = sorted_ids self._sort_idx = sort_idx def __iter__(self) -> Iterator: - sdata = self._sorted_data - if self.ngroups == 0: # we are inside a generator, rather than raise StopIteration # we merely return signal the end return starts, ends = lib.generate_slices(self._slabels, self.ngroups) - + sdata = self._sorted_data for start, end in zip(starts, ends): yield self._chop(sdata, slice(start, end)) @@ -1241,20 +1227,3 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: mgr = sdata._mgr.get_slice(slice_obj, axis=1) df = sdata._constructor_from_mgr(mgr, axes=mgr.axes) return df.__finalize__(sdata, method="groupby") - - -def _get_splitter( - data: NDFrame, - labels: npt.NDArray[np.intp], - ngroups: int, - *, - sort_idx: npt.NDArray[np.intp], - sorted_ids: npt.NDArray[np.intp], -) -> DataSplitter: - if isinstance(data, Series): - klass: type[DataSplitter] = SeriesSplitter - else: - # i.e. DataFrame - klass = FrameSplitter - - return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 132608d7df115..56de3f7f39175 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -104,7 +104,9 @@ def test_int64_overflow_groupby_large_df_shuffled(self, agg): gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! 
- assert is_int64_overflow_possible(gr._grouper.shape) + assert is_int64_overflow_possible( + tuple(ping.ngroups for ping in gr._grouper.groupings) + ) mi = MultiIndex.from_arrays( [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)], From b403f3cfd9457dde5b35e897beed2ecc3a5d98ca Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 18 Jun 2024 23:10:42 +0530 Subject: [PATCH 096/272] DOC: fix PR07,RT03 for pandas.merge_asof (#59044) --- ci/code_checks.sh | 1 - pandas/core/reshape/merge.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5dbec7f0c8a28..c4f143e9be2f4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -462,7 +462,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.merge_asof PR07,RT03" \ -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a6cb6b5f48de2..2ce77ac19b9c5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -673,7 +673,9 @@ def merge_asof( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label Field name to join on. Must be found in both DataFrames. The data MUST be ordered. Furthermore this must be a numeric column, @@ -712,6 +714,7 @@ def merge_asof( Returns ------- DataFrame + A DataFrame of the two merged objects. See Also -------- From bcbc5b21cc782ff542aa640391cb054089770ff6 Mon Sep 17 00:00:00 2001 From: ananiavito <48645073+ananiavito@users.noreply.github.com> Date: Tue, 18 Jun 2024 19:44:40 +0200 Subject: [PATCH 097/272] DOC: fix typos in User Guide (#59040) * DOC: fix two typos in User Guide io.rst * DOC: fix typo in timeseries.rst * DOC: fix typo in options.rst * DOC: fix typo in missing_data.rst * DOC: fix typo in io.rst * DOC: fix grammar error in basics.rst --- doc/source/user_guide/basics.rst | 2 +- doc/source/user_guide/io.rst | 8 ++++---- doc/source/user_guide/missing_data.rst | 2 +- doc/source/user_guide/options.rst | 2 +- doc/source/user_guide/timeseries.rst | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 0ff40dcdcd150..5cdc9779ef4e1 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1606,7 +1606,7 @@ For instance: This method does not convert the row to a Series object; it merely returns the values inside a namedtuple. Therefore, :meth:`~DataFrame.itertuples` preserves the data type of the values -and is generally faster as :meth:`~DataFrame.iterrows`. +and is generally faster than :meth:`~DataFrame.iterrows`. .. note:: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index dc06dd9620c24..c523f3a641d91 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3003,7 +3003,7 @@ However, if XPath does not reference node names such as default, ``/*``, then .. note:: Since ``xpath`` identifies the parent of content to be parsed, only immediate - desendants which include child nodes or current attributes are parsed. + descendants which include child nodes or current attributes are parsed. 
Therefore, ``read_xml`` will not parse the text of grandchildren or other descendants and will not parse attributes of any descendant. To retrieve lower level content, adjust xpath to lower level. For example, @@ -3535,7 +3535,7 @@ For example, to read in a ``MultiIndex`` index without names: df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df -If the index has level names, they will parsed as well, using the same +If the index has level names, they will be parsed as well, using the same parameters. .. ipython:: python @@ -5847,10 +5847,10 @@ You can check if a table exists using :func:`~pandas.io.sql.has_table` Schema support '''''''''''''' -Reading from and writing to different schema's is supported through the ``schema`` +Reading from and writing to different schemas is supported through the ``schema`` keyword in the :func:`~pandas.read_sql_table` and :func:`~pandas.DataFrame.to_sql` functions. Note however that this depends on the database flavor (sqlite does not -have schema's). For example: +have schemas). For example: .. code-block:: python diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 66e42352754ae..4e0245312b827 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -319,7 +319,7 @@ Missing values propagate through arithmetic operations between pandas objects. The descriptive statistics and computational methods discussed in the :ref:`data structure overview ` (and listed :ref:`here -` and :ref:`here `) are all +` and :ref:`here `) all account for missing data. When summing data, NA values or empty data will be treated as zero. diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index ce805f98ca528..7757d95c2bccd 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -8,7 +8,7 @@ Options and settings Overview -------- -pandas has an options API configure and customize global behavior related to +pandas has an options API to configure and customize global behavior related to :class:`DataFrame` display, data behavior and more. Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``). diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index d5137baa95ab8..0fa36f1e30104 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1479,7 +1479,7 @@ or some other non-observed day. Defined observance rules are: "after_nearest_workday", "apply ``nearest_workday`` and then move to next workday after that day" "sunday_to_monday", "move Sunday to following Monday" "next_monday_or_tuesday", "move Saturday to Monday and Sunday/Monday to Tuesday" - "previous_friday", move Saturday and Sunday to previous Friday" + "previous_friday", "move Saturday and Sunday to previous Friday" "next_monday", "move Saturday and Sunday to following Monday" "weekend_to_monday", "same as ``next_monday``" From f9f12de2c109151be8773d280bd68446c429c66c Mon Sep 17 00:00:00 2001 From: dsousa <106392201+St0rmie@users.noreply.github.com> Date: Tue, 18 Jun 2024 19:22:37 +0100 Subject: [PATCH 098/272] BUG: fixed Series.dt methods in ArrowDtype class that were returning incorrect values #57355 (#58052) BUG: fixed Series.dt methods in ArrowDtype class that were returning incorrect time values. 
(#57355) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 35 ++++++++++++++-------------- pandas/tests/extension/test_arrow.py | 25 ++++++++++++++++++++ 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 80e5e89b79690..36a0dd3718feb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -608,6 +608,7 @@ Other - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) +- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d073cf0b11c6b..8c39e0d87df4e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,7 +18,6 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - NaT, Timedelta, Timestamp, timezones, @@ -2612,17 +2611,19 @@ def _str_wrap(self, width: int, **kwargs) -> Self: @property def _dt_days(self) -> Self: return type(self)( - pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + pa.array( + self._to_timedeltaarray().components.days, + from_pandas=True, + type=pa.int32(), + ) ) @property def _dt_hours(self) -> Self: return type(self)( pa.array( - [ - td.components.hours if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.hours, + from_pandas=True, type=pa.int32(), ) ) @@ -2631,10 +2632,8 @@ def _dt_hours(self) -> Self: def _dt_minutes(self) -> Self: return type(self)( pa.array( - [ - td.components.minutes if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.minutes, + from_pandas=True, type=pa.int32(), ) ) @@ -2643,7 +2642,9 @@ def _dt_minutes(self) -> Self: def _dt_seconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.seconds, + from_pandas=True, + type=pa.int32(), ) ) @@ -2651,10 +2652,8 @@ def _dt_seconds(self) -> Self: def _dt_milliseconds(self) -> Self: return type(self)( pa.array( - [ - td.components.milliseconds if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.milliseconds, + from_pandas=True, type=pa.int32(), ) ) @@ -2663,7 +2662,7 @@ def _dt_milliseconds(self) -> Self: def _dt_microseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().microseconds, + self._to_timedeltaarray().components.microseconds, from_pandas=True, type=pa.int32(), ) @@ -2673,7 +2672,9 @@ def 
_dt_microseconds(self) -> Self: def _dt_nanoseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.nanoseconds, + from_pandas=True, + type=pa.int32(), ) ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5926d23b44dd0..f2e9d2321f33e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2905,6 +2905,31 @@ def test_dt_components(): tm.assert_frame_equal(result, expected) +def test_dt_components_large_values(): + ser = pd.Series( + [ + pd.Timedelta("365 days 23:59:59.999000"), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 From 849016caf476f6c678dfa5502c5a8075718460f5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 18 Jun 2024 08:57:09 -1000 Subject: [PATCH 099/272] REF: Use `default_index` or preserve original Index type for empty-like results (#59035) * Use more default_index for empty cases * fix tests * Update number * Address typing --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 3 +-- pandas/core/groupby/groupby.py | 5 ++--- pandas/core/groupby/grouper.py | 3 ++- pandas/core/indexes/api.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/core/methods/selectn.py | 7 ++++--- pandas/core/reshape/reshape.py | 4 ++-- pandas/tests/frame/methods/test_quantile.py | 13 +++++++++++-- pandas/tests/generic/test_generic.py | 8 ++++++-- pandas/tests/indexes/test_base.py | 2 +- .../tests/series/methods/test_get_numeric_data.py | 5 ++--- 13 files changed, 35 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 36a0dd3718feb..b2afc5e560b25 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -503,8 +503,8 @@ Timezones Numeric ^^^^^^^ +- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`) - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`) -- Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a6c0e1e372530..0aeda77233125 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13078,7 +13078,7 @@ def quantile( if len(data.columns) == 0: # GH#23925 _get_numeric_data may have dropped all columns - cols = Index([], name=self.columns.name) + cols = self.columns[:0] dtype = np.float64 if axis == 1: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 599b3d5578fca..93068c665a880 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -158,7 +158,6 @@ Index, MultiIndex, PeriodIndex, - RangeIndex, default_index, ensure_index, ) @@ -1852,7 +1851,7 @@ def _drop_labels_or_levels(self, keys, axis: AxisInt = 0): else: # Drop the last level of Index by replacing with # a RangeIndex - dropped.columns = RangeIndex(dropped.columns.size) + 
dropped.columns = default_index(dropped.columns.size) # Handle dropping index labels if labels_to_drop: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 83bb79bcddcf8..d45c891d6413b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -128,7 +128,6 @@ class providing the base-class of operations. from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, default_index, ) from pandas.core.internals.blocks import ensure_block_shape @@ -1264,7 +1263,7 @@ def _set_result_index_ordered( if self._grouper.has_dropped_na: # Add back in any missing rows due to dropna - index here is integral # with values referring to the row of the input so can use RangeIndex - result = result.reindex(RangeIndex(len(index)), axis=0) + result = result.reindex(default_index(len(index)), axis=0) result = result.set_axis(index, axis=0) return result @@ -1334,7 +1333,7 @@ def _wrap_aggregated_output( # enforced in __init__ result = self._insert_inaxis_grouper(result, qs=qs) result = result._consolidate() - result.index = RangeIndex(len(result)) + result.index = default_index(len(result)) else: index = self._grouper.result_index diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e75a5b9089f5f..5f680de77649f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -34,6 +34,7 @@ from pandas.core.indexes.api import ( Index, MultiIndex, + default_index, ) from pandas.core.series import Series @@ -901,7 +902,7 @@ def is_in_obj(gpr) -> bool: if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") if len(groupings) == 0: - groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) + groupings.append(Grouping(default_index(0), np.array([], dtype=np.intp))) # create the internals grouper grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 83e8df5072b92..5144e647e73b4 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -130,7 +130,7 @@ def _get_combined_index( # TODO: handle index names! 
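    # Note: default_index(0), used just below, gives an empty RangeIndex rather
    # than an object-dtype Index([]); illustrative check against this branch:
    #     >>> from pandas.core.indexes.api import default_index
    #     >>> default_index(0)
    #     RangeIndex(start=0, stop=0, step=1)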
indexes = _get_distinct_objs(indexes) if len(indexes) == 0: - index = Index([]) + index: Index = default_index(0) elif len(indexes) == 1: index = indexes[0] elif intersect: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 64109f5c1655c..79cba9275a119 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -249,7 +249,7 @@ def blklocs(self) -> npt.NDArray[np.intp]: def make_empty(self, axes=None) -> Self: """return an empty BlockManager with the items axis of len 0""" if axes is None: - axes = [Index([])] + self.axes[1:] + axes = [default_index(0)] + self.axes[1:] # preserve dtype if possible if self.ndim == 1: diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 283acaca2c117..02e7445f1d275 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -29,6 +29,8 @@ ) from pandas.core.dtypes.dtypes import BaseMaskedDtype +from pandas.core.indexes.api import default_index + if TYPE_CHECKING: from pandas._typing import ( DtypeObj, @@ -38,6 +40,7 @@ from pandas import ( DataFrame, + Index, Series, ) else: @@ -199,8 +202,6 @@ def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> No self.columns = columns def compute(self, method: str) -> DataFrame: - from pandas.core.api import Index - n = self.n frame = self.obj columns = self.columns @@ -227,7 +228,7 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: original_index = frame.index cur_frame = frame = frame.reset_index(drop=True) cur_n = n - indexer = Index([], dtype=np.int64) + indexer: Index = default_index(0) for i, column in enumerate(columns): # For each column we apply method to cur_frame[column]. diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a8efae8da82c8..664ac57fcc823 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -42,7 +42,7 @@ from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, + default_index, ) from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -1047,7 +1047,7 @@ def stack_reshape( if data.ndim == 1: data.name = 0 else: - data.columns = RangeIndex(len(data.columns)) + data.columns = default_index(len(data.columns)) buf.append(data) if len(buf) > 0 and not frame.empty: diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 32ae4c0ff2f50..f35b77da0b547 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -710,14 +710,14 @@ def test_quantile_empty_no_columns(self, interp_method): result = df.quantile( 0.5, numeric_only=True, interpolation=interpolation, method=method ) - expected = Series([], index=[], name=0.5, dtype=np.float64) + expected = Series([], name=0.5, dtype=np.float64) expected.index.name = "captain tightpants" tm.assert_series_equal(result, expected) result = df.quantile( [0.5], numeric_only=True, interpolation=interpolation, method=method ) - expected = DataFrame([], index=[0.5], columns=[]) + expected = DataFrame([], index=[0.5]) expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) @@ -926,3 +926,12 @@ def test_datelike_numeric_only(self, expected_data, expected_index, axis): expected_data, name=0.5, index=Index(expected_index), dtype=np.float64 ) tm.assert_series_equal(result, expected) + + +def test_multi_quantile_numeric_only_retains_columns(): + df = DataFrame(list("abc")) + 
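+    # every column is object dtype, so numeric_only=True drops them all and
+    # the quantile result is empty; it should still carry the original
+    # (RangeIndex) columns, which check_column_type=True asserts below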
result = df.quantile([0.5, 0.7], numeric_only=True) + expected = DataFrame(index=[0.5, 0.7]) + tm.assert_frame_equal( + result, expected, check_index_type=True, check_column_type=True + ) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 0b607d91baf65..b591b1b1092d4 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -93,8 +93,7 @@ def test_get_numeric_data(self, frame_or_series): if isinstance(o, DataFrame): # preserve columns dtype expected.columns = o.columns[:0] - # https://github.com/pandas-dev/pandas/issues/50862 - tm.assert_equal(result.reset_index(drop=True), expected) + tm.assert_equal(result, expected) # get the bool data arr = np.array([True, True, False, True]) @@ -102,6 +101,11 @@ def test_get_numeric_data(self, frame_or_series): result = o._get_numeric_data() tm.assert_equal(result, o) + def test_get_bool_data_empty_preserve_index(self): + expected = Series([], dtype="bool") + result = expected._get_bool_data() + tm.assert_series_equal(result, expected, check_index_type=True) + def test_nonzero(self, frame_or_series): # GH 4633 # look at the boolean/nonzero behavior for objects diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e701a49ea93ad..16908fbb4fecc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1558,7 +1558,7 @@ def test_ensure_index_uint64(self): def test_get_combined_index(self): result = _get_combined_index([]) - expected = Index([]) + expected = RangeIndex(0) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py index f25583904377a..4a11d7905f506 100644 --- a/pandas/tests/series/methods/test_get_numeric_data.py +++ b/pandas/tests/series/methods/test_get_numeric_data.py @@ -1,5 +1,4 @@ from pandas import ( - Index, Series, date_range, ) @@ -19,7 +18,7 @@ def test_get_numeric_data_preserve_dtype(self): obj = Series([1, "2", 3.0]) result = obj._get_numeric_data() - expected = Series([], dtype=object, index=Index([], dtype=object)) + expected = Series([], dtype=object) tm.assert_series_equal(result, expected) obj = Series([True, False, True]) @@ -28,5 +27,5 @@ def test_get_numeric_data_preserve_dtype(self): obj = Series(date_range("20130101", periods=3)) result = obj._get_numeric_data() - expected = Series([], dtype="M8[ns]", index=Index([], dtype=object)) + expected = Series([], dtype="M8[ns]") tm.assert_series_equal(result, expected) From a4c9446d45e6cebeabe9d8d67997b860b403576c Mon Sep 17 00:00:00 2001 From: chaoyihu <90495101+chaoyihu@users.noreply.github.com> Date: Tue, 18 Jun 2024 14:49:20 -0700 Subject: [PATCH 100/272] Fix wrong save of datetime64[s] in HDFStore (#59018) * Fix wrong save of datetime64[s] in HDFStore * generic datetime unit parsing * use tmp_path * Adding entry to whatsnew * datetime64 dtype parsing using numpy api * move whatsnew entry * update test comment * update hdfstore dtypes test case --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/pytables.py | 12 ++++++++---- pandas/tests/io/pytables/test_read.py | 11 +++++++++++ pandas/tests/io/pytables/test_round_trip.py | 10 +++++++--- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b2afc5e560b25..b952ffd7661a7 100644 --- 
a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -546,6 +546,7 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) +- Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4fce338ccad6f..d98c51159eb63 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2655,7 +2655,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): # reverse converts if dtype.startswith("datetime64"): # recreate with tz if indicated - converted = _set_tz(converted, tz) + converted = _set_tz(converted, tz, dtype) elif dtype == "timedelta64": converted = np.asarray(converted, dtype="m8[ns]") @@ -3036,7 +3036,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - ret = _set_tz(ret, tz) + ret = _set_tz(ret, tz, dtype) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -4964,7 +4964,9 @@ def _get_tz(tz: tzinfo) -> str | tzinfo: return zone -def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeArray: +def _set_tz( + values: npt.NDArray[np.int64], tz: str | tzinfo | None, datetime64_dtype: str +) -> DatetimeArray: """ Coerce the values to a DatetimeArray with appropriate tz. @@ -4972,11 +4974,13 @@ def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeA ---------- values : ndarray[int64] tz : str, tzinfo, or None + datetime64_dtype : str, e.g. 
"datetime64[ns]", "datetime64[25s]" """ assert values.dtype == "i8", values.dtype # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None"; # expected "tzinfo" - dtype = tz_to_dtype(tz=tz, unit="ns") # type: ignore[arg-type] + unit, _ = np.datetime_data(datetime64_dtype) # parsing dtype: unit, count + dtype = tz_to_dtype(tz=tz, unit=unit) # type: ignore[arg-type] dta = DatetimeArray._from_sequence(values, dtype=dtype) return dta diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index e33ddaf3b81f0..ba108370a4a92 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -317,3 +317,14 @@ def test_read_infer_string(tmp_path, setup_path): columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) + + +def test_hdfstore_read_datetime64_unit_s(tmp_path, setup_path): + # GH 59004 + df_s = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]") + path = tmp_path / setup_path + with HDFStore(path, mode="w") as store: + store.put("df_s", df_s) + with HDFStore(path, mode="r") as store: + df_fromstore = store.get("df_s") + tm.assert_frame_equal(df_s, df_fromstore) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 51ee289c8e27a..3ad05cec3bca3 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -236,8 +236,10 @@ def test_table_values_dtypes_roundtrip(setup_path): df1["float322"] = 1.0 df1["float322"] = df1["float322"].astype("float32") df1["bool"] = df1["float32"] > 0 - df1["time1"] = Timestamp("20130101") - df1["time2"] = Timestamp("20130102") + df1["time_s_1"] = Timestamp("20130101") + df1["time_s_2"] = Timestamp("20130101 00:00:00") + df1["time_ms"] = Timestamp("20130101 00:00:00.000") + df1["time_ns"] = Timestamp("20130102 00:00:00.000000000") store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() @@ -252,7 +254,9 @@ def test_table_values_dtypes_roundtrip(setup_path): "int8": 1, "int64": 1, "object": 1, - "datetime64[ns]": 2, + "datetime64[s]": 2, + "datetime64[ms]": 1, + "datetime64[ns]": 1, }, name="count", ) From c46fb76afaf98153b9eef97fc9bbe9077229e7cd Mon Sep 17 00:00:00 2001 From: William Andrea <22385371+wjandrea@users.noreply.github.com> Date: Tue, 18 Jun 2024 20:22:19 -0400 Subject: [PATCH 101/272] DOC: Fix diction: "e.g." -> "i.e." (#59047) Fix diction: "e.g." -> "i.e." "e.g." means "for example", "i.e." means "that is" --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 42f5266e7649b..db4c5f8507946 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1178,7 +1178,7 @@ def read_html( **after** `skiprows` is applied. This function will *always* return a list of :class:`DataFrame` *or* - it will fail, e.g., it will *not* return an empty list. + it will fail, i.e., it will *not* return an empty list. 
Examples -------- From 75ac95e6f7825633b02468c6eadd762cdece1b5d Mon Sep 17 00:00:00 2001 From: William Andrea <22385371+wjandrea@users.noreply.github.com> Date: Fri, 21 Jun 2024 13:38:02 -0400 Subject: [PATCH 102/272] DOC: Typo in missing_data.rst (#59061) Typo in missing_data.rst *compaisons -> comparisons --- doc/source/user_guide/missing_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 4e0245312b827..e15939eb49239 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -60,7 +60,7 @@ To detect these missing value, use the :func:`isna` or :func:`notna` methods. .. warning:: - Equality compaisons between ``np.nan``, :class:`NaT`, and :class:`NA` + Equality comparisons between ``np.nan``, :class:`NaT`, and :class:`NA` do not act like ``None`` .. ipython:: python From 0d6a48c3ac5773f8685c6fd293250621de7a8b90 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 21 Jun 2024 23:09:05 +0530 Subject: [PATCH 103/272] DOC: fix SA01 for pandas.Timedelta.days (#59068) * DOC: fix SA01 for pandas.Timedelta.days * DOC: fix SA01 for pandas.Timedelta.days --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/timedeltas.pyx | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c4f143e9be2f4..5b73bb30cbfa3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -213,7 +213,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.asm8 SA01" \ -i "pandas.Timedelta.ceil SA01" \ -i "pandas.Timedelta.components SA01" \ - -i "pandas.Timedelta.days SA01" \ -i "pandas.Timedelta.floor SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index dc4aed9920734..46a68836b24a1 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1078,10 +1078,22 @@ cdef class _Timedelta(timedelta): """ Returns the days of the timedelta. + The `days` attribute of a `pandas.Timedelta` object provides the number + of days represented by the `Timedelta`. This is useful for extracting + the day component from a `Timedelta` that may also include hours, minutes, + seconds, and smaller time units. This attribute simplifies the process + of working with durations where only the day component is of interest. + Returns ------- int + See Also + -------- + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. + Timedelta.total_seconds : Returns the total duration in seconds. 
+ Examples -------- >>> td = pd.Timedelta(1, "d") From 05a2f1c624dc78124e6ae71f1a03b89f5e45acbe Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 21 Jun 2024 23:09:37 +0530 Subject: [PATCH 104/272] DOC: add SA01 for pandas.Series.ge (#59067) --- ci/code_checks.sh | 1 - pandas/core/series.py | 63 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5b73bb30cbfa3..013f7abe5ff0d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -154,7 +154,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_convert PR01,PR02" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ - -i "pandas.Series.ge SA01" \ -i "pandas.Series.gt SA01" \ -i "pandas.Series.list.__getitem__ SA01" \ -i "pandas.Series.list.flatten SA01" \ diff --git a/pandas/core/series.py b/pandas/core/series.py index 3d1bd8ebb03cb..e833c2a078762 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -6050,8 +6050,69 @@ def lt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.lt, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("ge", "series")) def ge(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Greater than or equal to of series and other, \ + element-wise (binary operator `ge`). + + Equivalent to ``series >= other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.gt : Greater than comparison, element-wise. + Series.le : Less than or equal to comparison, element-wise. + Series.lt : Less than comparison, element-wise. + Series.eq : Equal to comparison, element-wise. + Series.ne : Not equal to comparison, element-wise. 
+ + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan, 1], index=["a", "b", "c", "d", "e"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + e 1.0 + dtype: float64 + >>> b = pd.Series([0, 1, 2, np.nan, 1], index=["a", "b", "c", "d", "f"]) + >>> b + a 0.0 + b 1.0 + c 2.0 + d NaN + f 1.0 + dtype: float64 + >>> a.ge(b, fill_value=0) + a True + b True + c False + d False + e True + f False + dtype: bool + """ return self._flex_method( other, operator.ge, level=level, fill_value=fill_value, axis=axis ) From 214ac73ab7de1a3bcd38dcb2630145f831661530 Mon Sep 17 00:00:00 2001 From: JulienBacquart <16917004+JulienBacquart@users.noreply.github.com> Date: Fri, 21 Jun 2024 19:49:33 +0200 Subject: [PATCH 105/272] DOC: Correct typo in pandas.pivot_table doc (#59056) Correct typo Missing 's' --- pandas/core/reshape/pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 8c2c2053b0554..131924bc059f6 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -90,7 +90,7 @@ def pivot_table( hierarchical columns whose top level are the function names (inferred from the function objects themselves). If a dict is passed, the key is column to aggregate and the value is - function or list of functions. If ``margin=True``, aggfunc will be + function or list of functions. If ``margins=True``, aggfunc will be used to calculate the partial aggregates. fill_value : scalar, default None Value to replace missing values with (in the resulting pivot table, From 73ea3c16b62704598bce4e8741dbe0be4bae87e2 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Fri, 21 Jun 2024 19:51:32 +0200 Subject: [PATCH 106/272] DEPR: deprecate units 'w', 'd', 'MS', 'US', 'NS' for Timedelta in favor of 'W', 'D', 'ms', 'us', 'ns' (#59051) * deprecate lower/uppercase units for Timedelta * correct examples in timedeltas.rst, add a note to v.3.0.0 * fix tests * add tests, fix tests, corrected docs examples * correct docs * fix an example in v0.13.0 --- doc/source/user_guide/timedeltas.rst | 4 +- doc/source/whatsnew/v0.13.0.rst | 24 ++++++--- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/tslibs/dtypes.pxd | 1 + pandas/_libs/tslibs/dtypes.pyx | 11 +++++ pandas/_libs/tslibs/timedeltas.pyx | 13 +++-- pandas/core/arrays/timedeltas.py | 6 +-- pandas/core/indexes/accessors.py | 2 +- pandas/core/tools/timedeltas.py | 2 +- pandas/tests/frame/indexing/test_mask.py | 4 +- pandas/tests/frame/methods/test_astype.py | 2 +- .../tests/frame/methods/test_reset_index.py | 4 +- pandas/tests/groupby/test_groupby.py | 4 +- pandas/tests/groupby/test_reductions.py | 2 +- .../indexes/timedeltas/methods/test_shift.py | 4 +- .../indexes/timedeltas/test_constructors.py | 27 +++++++++- .../tests/indexes/timedeltas/test_delete.py | 2 +- .../tests/indexes/timedeltas/test_indexing.py | 14 ++++-- .../tests/indexes/timedeltas/test_setops.py | 7 ++- pandas/tests/indexing/test_categorical.py | 4 +- pandas/tests/io/sas/test_sas7bdat.py | 4 +- pandas/tests/reshape/merge/test_merge.py | 4 +- .../tests/scalar/timedelta/test_arithmetic.py | 46 ++++++++--------- .../scalar/timedelta/test_constructors.py | 30 +++++++----- pandas/tests/scalar/timedelta/test_formats.py | 2 +- .../tests/scalar/timedelta/test_timedelta.py | 49 ++++++++++++++----- pandas/tests/series/methods/test_astype.py | 2 +- pandas/tests/series/methods/test_isin.py | 2 +- pandas/tests/series/methods/test_nlargest.py | 2 +- pandas/tests/tools/test_to_timedelta.py | 5 
+- pandas/tests/window/test_rolling.py | 2 +- 31 files changed, 192 insertions(+), 94 deletions(-) diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 5daf204f39bcf..01df17bac5fd7 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -35,7 +35,7 @@ You can construct a ``Timedelta`` scalar through various arguments, including `I pd.Timedelta(days=1, seconds=1) # integers with a unit - pd.Timedelta(1, unit="d") + pd.Timedelta(1, unit="D") # from a datetime.timedelta/np.timedelta64 pd.Timedelta(datetime.timedelta(days=1, seconds=1)) @@ -94,7 +94,7 @@ is numeric: .. ipython:: python pd.to_timedelta(np.arange(5), unit="s") - pd.to_timedelta(np.arange(5), unit="d") + pd.to_timedelta(np.arange(5), unit="D") .. warning:: If a string or array of strings is passed as an input then the ``unit`` keyword diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 3c5488a47bdf2..8e323d8aac5e3 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -523,13 +523,25 @@ Enhancements Using the new top-level ``to_timedelta``, you can convert a scalar or array from the standard timedelta format (produced by ``to_csv``) into a timedelta type (``np.timedelta64`` in ``nanoseconds``). - .. ipython:: python + .. code-block:: ipython + + In [53]: pd.to_timedelta('1 days 06:05:01.00003') + Out[53]: Timedelta('1 days 06:05:01.000030') + + In [54]: pd.to_timedelta('15.5us') + Out[54]: Timedelta('0 days 00:00:00.000015500') + + In [55]: pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + Out[55]: TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], dtype='timedelta64[ns]', freq=None) + + In [56]: pd.to_timedelta(np.arange(5), unit='s') + Out[56]: + TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', + '0 days 00:00:03', '0 days 00:00:04'], + dtype='timedelta64[ns]', freq=None) - pd.to_timedelta('1 days 06:05:01.00003') - pd.to_timedelta('15.5us') - pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) - pd.to_timedelta(np.arange(5), unit='s') - pd.to_timedelta(np.arange(5), unit='d') + In [57]: pd.to_timedelta(np.arange(5), unit='d') + Out[57]: TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) A Series of dtype ``timedelta64[ns]`` can now be divided by another ``timedelta64[ns]`` object, or astyped to yield a ``float64`` dtyped Series. This diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b952ffd7661a7..9748a461859c2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -273,6 +273,7 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`) - Deprecated behavior of :meth:`Series.dt.to_pytimedelta`, in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta; this matches the behavior of other :meth:`Series.dt` properties. (:issue:`57463`) - Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`) +- Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`) - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. 
(:issue:`57063`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 455bca35d160a..204d582294a5b 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -15,6 +15,7 @@ cdef dict c_OFFSET_TO_PERIOD_FREQSTR cdef dict c_PERIOD_TO_OFFSET_FREQSTR cdef dict c_OFFSET_RENAMED_FREQSTR cdef dict c_DEPR_ABBREVS +cdef dict c_DEPR_UNITS cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 479a5a328b1d8..e047566a1868e 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -346,6 +346,17 @@ cdef dict c_DEPR_ABBREVS = { "S": "s", } +cdef dict c_DEPR_UNITS = { + "w": "W", + "d": "D", + "H": "h", + "MIN": "min", + "S": "s", + "MS": "ms", + "US": "us", + "NS": "ns", +} + cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { "w": "W", "MIN": "min", diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 46a68836b24a1..de192d511d507 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -43,7 +43,7 @@ from pandas._libs.tslibs.conversion cimport ( precision_from_unit, ) from pandas._libs.tslibs.dtypes cimport ( - c_DEPR_ABBREVS, + c_DEPR_UNITS, get_supported_reso, is_supported_unit, npy_unit_to_abbrev, @@ -719,15 +719,15 @@ cpdef inline str parse_timedelta_unit(str unit): return "ns" elif unit == "M": return unit - elif unit in c_DEPR_ABBREVS: + elif unit in c_DEPR_UNITS: warnings.warn( f"\'{unit}\' is deprecated and will be removed in a " - f"future version. Please use \'{c_DEPR_ABBREVS.get(unit)}\' " + f"future version. Please use \'{c_DEPR_UNITS.get(unit)}\' " f"instead of \'{unit}\'.", FutureWarning, stacklevel=find_stack_level(), ) - unit = c_DEPR_ABBREVS[unit] + unit = c_DEPR_UNITS[unit] try: return timedelta_abbrevs[unit.lower()] except KeyError: @@ -1823,6 +1823,11 @@ class Timedelta(_Timedelta): Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour of the values `h`, `min`, `s`, `ms`, `us`, and `ns`. + .. deprecated:: 3.0.0 + + Allowing the values `w`, `d`, `MIN`, `MS`, `US` and `NS` to denote units + are deprecated in favour of the values `W`, `D`, `min`, `ms`, `us` and `ns`. + **kwargs Available kwargs: {days, seconds, microseconds, milliseconds, minutes, hours, weeks}. 
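A rough sketch of the user-visible effect of this unit deprecation
(illustrative session; the FutureWarning text follows the ``c_DEPR_UNITS``
message template added above):

    >>> import pandas as pd
    >>> pd.Timedelta(1, unit="d")  # deprecated alias, emits a FutureWarning
    Timedelta('1 days 00:00:00')
    >>> pd.Timedelta(1, unit="D")  # preferred spelling, no warning
    Timedelta('1 days 00:00:00')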
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 865e81d7754ef..15bfe442ca87f 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -746,7 +746,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]: -------- **Series** - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="D")) >>> s 0 0 days 1 1 days @@ -765,7 +765,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]: **TimedeltaIndex** - >>> idx = pd.to_timedelta(np.arange(5), unit="d") + >>> idx = pd.to_timedelta(np.arange(5), unit="D") >>> idx TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) @@ -809,7 +809,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='d')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='D')) >>> ser 0 1 days 1 2 days diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 3cb51f7447677..e2dc71f68a65b 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -459,7 +459,7 @@ def to_pytimedelta(self) -> np.ndarray: Examples -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="D")) >>> s 0 0 days 1 1 days diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 296168fe7e725..8d82a5c213910 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -170,7 +170,7 @@ def to_timedelta( TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', '0 days 00:00:04'], dtype='timedelta64[ns]', freq=None) - >>> pd.to_timedelta(np.arange(5), unit="d") + >>> pd.to_timedelta(np.arange(5), unit="D") TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py index 264e27c9c122e..ac6f0a1ac0f73 100644 --- a/pandas/tests/frame/indexing/test_mask.py +++ b/pandas/tests/frame/indexing/test_mask.py @@ -122,7 +122,7 @@ def test_mask_stringdtype(frame_or_series): def test_mask_where_dtype_timedelta(): # https://github.com/pandas-dev/pandas/issues/39548 - df = DataFrame([Timedelta(i, unit="d") for i in range(5)]) + df = DataFrame([Timedelta(i, unit="D") for i in range(5)]) expected = DataFrame(np.full(5, np.nan, dtype="timedelta64[ns]")) tm.assert_frame_equal(df.mask(df.notna()), expected) @@ -130,7 +130,7 @@ def test_mask_where_dtype_timedelta(): expected = DataFrame( [np.nan, np.nan, np.nan, Timedelta("3 day"), Timedelta("4 day")] ) - tm.assert_frame_equal(df.where(df > Timedelta(2, unit="d")), expected) + tm.assert_frame_equal(df.where(df > Timedelta(2, unit="D")), expected) def test_mask_return_dtype(): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 55f8052d05cf1..41129966cd589 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -149,7 +149,7 @@ def test_astype_str(self): # see GH#9757 a = Series(date_range("2010-01-04", periods=5)) b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) - c = Series([Timedelta(x, unit="d") for x in range(5)]) + c = Series([Timedelta(x, unit="D") for x in range(5)]) d = Series(range(5)) e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) diff --git 
a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 22ce091d4ed62..980dd5243daa5 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -600,8 +600,8 @@ def test_reset_index_with_drop( {"a": [pd.NaT, Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]}, ), ( - [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)], - {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]}, + [(pd.NaT, 1), (pd.Timedelta(123, "D"), 2)], + {"a": [pd.NaT, pd.Timedelta(123, "D")], "b": [1, 2], "x": [11, 12]}, ), ], ) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c1dc8953580a..6ff70d26d8425 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -148,8 +148,8 @@ def test_len_nan_group(): def test_groupby_timedelta_median(): # issue 57926 - expected = Series(data=Timedelta("1d"), index=["foo"]) - df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1d")]}) + expected = Series(data=Timedelta("1D"), index=["foo"]) + df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1D")]}) gb = df.groupby("label")["timedelta"] actual = gb.median() tm.assert_series_equal(actual, expected, check_names=False) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index edc94b2beeec1..00438c2100bad 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -982,7 +982,7 @@ def test_groupby_sum_timedelta_with_nat(): df = DataFrame( { "a": [1, 1, 2, 2], - "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + "b": [pd.Timedelta("1D"), pd.Timedelta("2D"), pd.Timedelta("3D"), pd.NaT], } ) td3 = pd.Timedelta(days=3) diff --git a/pandas/tests/indexes/timedeltas/methods/test_shift.py b/pandas/tests/indexes/timedeltas/methods/test_shift.py index a0986d1496881..9bbf06dc51a0c 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_shift.py +++ b/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -37,7 +37,7 @@ def test_tdi_shift_minutes(self): def test_tdi_shift_int(self): # GH#8083 - tdi = pd.to_timedelta(range(5), unit="d") + tdi = pd.to_timedelta(range(5), unit="D") trange = tdi._with_freq("infer") + pd.offsets.Hour(1) result = trange.shift(1) expected = TimedeltaIndex( @@ -54,7 +54,7 @@ def test_tdi_shift_int(self): def test_tdi_shift_nonstandard_freq(self): # GH#8083 - tdi = pd.to_timedelta(range(5), unit="d") + tdi = pd.to_timedelta(range(5), unit="D") trange = tdi._with_freq("infer") + pd.offsets.Hour(1) result = trange.shift(3, freq="2D 1s") expected = TimedeltaIndex( diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index 12ac5dd63bd8c..ace0ab7990138 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -168,7 +168,7 @@ def test_constructor_coverage(self): # NumPy string array strings = np.array(["1 days", "2 days", "3 days"]) result = TimedeltaIndex(strings) - expected = to_timedelta([1, 2, 3], unit="d") + expected = to_timedelta([1, 2, 3], unit="D") tm.assert_index_equal(result, expected) from_ints = TimedeltaIndex(expected.asi8) @@ -239,3 +239,28 @@ def test_from_categorical(self): ci = pd.CategoricalIndex(tdi) result = TimedeltaIndex(ci) tm.assert_index_equal(result, tdi) + + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", 
"w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): + # GH#52536, GH#59051 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." + + expected = TimedeltaIndex([f"1{unit}", f"2{unit}"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = TimedeltaIndex([f"1{unit_depr}", f"2{unit_depr}"]) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + tdi = to_timedelta([1, 2], unit=unit_depr) + tm.assert_index_equal(tdi, expected) diff --git a/pandas/tests/indexes/timedeltas/test_delete.py b/pandas/tests/indexes/timedeltas/test_delete.py index 6e6f54702ce1a..f49af7cd0befd 100644 --- a/pandas/tests/indexes/timedeltas/test_delete.py +++ b/pandas/tests/indexes/timedeltas/test_delete.py @@ -44,7 +44,7 @@ def test_delete_slice(self): # reset freq to None expected_3_5 = TimedeltaIndex( - ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ["1 D", "2 D", "3 D", "7 D", "8 D", "9 D", "10D"], freq=None, name="idx" ) cases = { diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 397f9d9e18331..e411555c65bea 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -20,8 +20,10 @@ class TestGetItem: def test_getitem_slice_keeps_name(self): - # GH#4226 - tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") + # GH#4226, GH#59051 + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") assert tdi[1:].name == tdi.name def test_getitem(self): @@ -230,7 +232,7 @@ def test_take_invalid_kwargs(self): def test_take_equiv_getitem(self): tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] - idx = timedelta_range(start="1d", end="2d", freq="h", name="idx") + idx = timedelta_range(start="1D", end="2D", freq="h", name="idx") expected = TimedeltaIndex(tds, freq=None, name="idx") taken1 = idx.take([2, 4, 10]) @@ -337,8 +339,10 @@ def test_contains_nonunique(self): def test_contains(self): # Checking for any NaT-like objects - # GH#13603 - td = to_timedelta(range(5), unit="d") + offsets.Hour(1) + # GH#13603, GH#59051 + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + td = to_timedelta(range(5), unit="d") + offsets.Hour(1) for v in [NaT, None, float("nan"), np.nan]: assert v not in td diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index fce10d9176d74..ae88caf18fdae 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -42,7 +42,10 @@ def test_union_sort_false(self): tm.assert_index_equal(result, expected) def test_union_coverage(self): - idx = TimedeltaIndex(["3d", "1d", "2d"]) + # GH#59051 + msg = "'d' is deprecated and will be removed in a future version." 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + idx = TimedeltaIndex(["3d", "1d", "2d"]) ordered = TimedeltaIndex(idx.sort_values(), freq="infer") result = ordered.union(idx) tm.assert_index_equal(result, ordered) @@ -70,7 +73,7 @@ def test_union_bug_1745(self): tm.assert_index_equal(result, exp) def test_union_bug_4564(self): - left = timedelta_range("1 day", "30d") + left = timedelta_range("1 day", "30D") right = left + pd.offsets.Minute(15) result = left.union(right) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 1b58f8e8b9831..c9f29b2cb55fe 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -511,13 +511,13 @@ def test_loc_and_at_with_categorical_index(self): # pandas scalars [Interval(1, 4), Interval(4, 6), Interval(6, 9)], [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], - [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], + [Timedelta(1, "D"), Timedelta(2, "D"), Timedelta(3, "D")], # pandas Integer arrays *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES), # other pandas arrays pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, pd.date_range("2019-01-01", periods=3).array, - pd.timedelta_range(start="1d", periods=3).array, + pd.timedelta_range(start="1D", periods=3).array, ], ) def test_loc_getitem_with_non_string_categories(self, idx_values, ordered): diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index fc5df6d9babcb..62f234ec2db4a 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -30,9 +30,9 @@ def data_test_ix(request, dirpath): fname = os.path.join(dirpath, f"test_sas7bdat_{i}.csv") df = pd.read_csv(fname) epoch = datetime(1960, 1, 1) - t1 = pd.to_timedelta(df["Column4"], unit="d") + t1 = pd.to_timedelta(df["Column4"], unit="D") df["Column4"] = (epoch + t1).astype("M8[s]") - t2 = pd.to_timedelta(df["Column12"], unit="d") + t2 = pd.to_timedelta(df["Column12"], unit="D") df["Column12"] = (epoch + t2).astype("M8[s]") for k in range(df.shape[1]): col = df.iloc[:, k] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0a5989e3c82e6..0ab4d08db7cc9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1367,8 +1367,8 @@ def test_merge_two_empty_df_no_division_error(self): ), ), ( - TimedeltaIndex(["1d", "2d", "3d"]), - TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]), + TimedeltaIndex(["1D", "2D", "3D"]), + TimedeltaIndex(["1D", "2D", "3D", pd.NaT, pd.NaT, pd.NaT]), ), ], ) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index efeca375affbb..f29135cbf399e 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -79,7 +79,7 @@ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_datetimelike_scalar(self, op): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, datetime(2016, 1, 1)) if op is operator.add: @@ -111,7 +111,7 @@ def test_td_add_timestamp_overflow(self): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, Timedelta(days=10)) assert isinstance(result, 
Timedelta) @@ -119,35 +119,35 @@ def test_td_add_td(self, op): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_pytimedelta(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, timedelta(days=9)) assert isinstance(result, Timedelta) assert result == Timedelta(days=19) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, np.timedelta64(-4, "D")) assert isinstance(result, Timedelta) assert result == Timedelta(days=6) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_offset(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, offsets.Hour(6)) assert isinstance(result, Timedelta) assert result == Timedelta(days=10, hours=6) def test_td_sub_td(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td assert isinstance(result, Timedelta) assert result == expected def test_td_sub_pytimedelta(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_pytimedelta() @@ -159,7 +159,7 @@ def test_td_sub_pytimedelta(self): assert result == expected def test_td_sub_timedelta64(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_timedelta64() @@ -172,12 +172,12 @@ def test_td_sub_timedelta64(self): def test_td_sub_nat(self): # In this context pd.NaT is treated as timedelta-like - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - NaT assert result is NaT def test_td_sub_td64_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") td_nat = np.timedelta64("NaT") result = td - td_nat @@ -187,13 +187,13 @@ def test_td_sub_td64_nat(self): assert result is NaT def test_td_sub_offset(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - offsets.Hour(1) assert isinstance(result, Timedelta) assert result == Timedelta(239, unit="h") def test_td_add_sub_numeric_raises(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") msg = "unsupported operand type" for other in [2, 2.0, np.int64(2), np.float64(2)]: with pytest.raises(TypeError, match=msg): @@ -234,7 +234,7 @@ def test_td_add_sub_int_ndarray(self): other - td def test_td_rsub_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT - td assert result is NaT @@ -242,7 +242,7 @@ def test_td_rsub_nat(self): assert result is NaT def test_td_rsub_offset(self): - result = offsets.Hour(1) - Timedelta(10, unit="d") + result = offsets.Hour(1) - Timedelta(10, unit="D") assert isinstance(result, Timedelta) assert result == Timedelta(-239, unit="h") @@ -362,7 +362,7 @@ class TestTimedeltaMultiplicationDivision: @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nat(self, op, td_nat): # GH#19819 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") typs = "|".join(["numpy.timedelta64", "NaTType", "Timedelta"]) msg = "|".join( [ @@ -377,7 +377,7 @@ def test_td_mul_nat(self, op, td_nat): @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nan(self, op, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, nan) assert result is NaT @@ -449,7 +449,7 @@ def 
test_td_mul_td64_ndarray_invalid(self): def test_td_div_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / offsets.Hour(1) assert result == 240 @@ -480,7 +480,7 @@ def test_td_div_td64_non_nano(self): def test_td_div_numeric_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / 2 assert isinstance(result, Timedelta) @@ -500,7 +500,7 @@ def test_td_div_numeric_scalar(self): ) def test_td_div_nan(self, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / nan assert result is NaT @@ -532,7 +532,7 @@ def test_td_div_ndarray_0d(self): def test_td_rdiv_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = offsets.Hour(1) / td assert result == 1 / 240.0 @@ -540,7 +540,7 @@ def test_td_rdiv_timedeltalike_scalar(self): def test_td_rdiv_na_scalar(self): # GH#31869 None gets cast to NaT - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT / td assert np.isnan(result) @@ -560,7 +560,7 @@ def test_td_rdiv_na_scalar(self): np.nan / td def test_td_rdiv_ndarray(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") arr = np.array([td], dtype=object) result = arr / td @@ -583,7 +583,7 @@ def test_td_rdiv_ndarray(self): arr / td def test_td_rdiv_ndarray_0d(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") arr = np.array(td.asm8) diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 5509216f4daf4..e029dfc3b2703 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -32,20 +32,31 @@ def test_unit_m_y_raises(self, unit): with pytest.raises(ValueError, match=msg): to_timedelta([1, 2], unit) - @pytest.mark.parametrize("unit", ["h", "s"]) - def test_units_H_S_deprecated(self, unit): + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): # GH#52536 - msg = f"'{unit.upper()}' is deprecated and will be removed in a future version." + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." 
expected = Timedelta(1, unit=unit) with tm.assert_produces_warning(FutureWarning, match=msg): - result = Timedelta(1, unit=unit.upper()) + result = Timedelta(1, unit=unit_depr) tm.assert_equal(result, expected) @pytest.mark.parametrize( "unit, np_unit", - [(value, "W") for value in ["W", "w"]] - + [(value, "D") for value in ["D", "d", "days", "day", "Days", "Day"]] + [("W", "W")] + + [(value, "D") for value in ["D", "days", "day", "Days", "Day"]] + [ (value, "m") for value in [ @@ -78,7 +89,6 @@ def test_units_H_S_deprecated(self, unit): "millisecond", "milli", "millis", - "MS", "Milliseconds", "Millisecond", "Milli", @@ -93,7 +103,6 @@ def test_units_H_S_deprecated(self, unit): "microsecond", "micro", "micros", - "US", "Microseconds", "Microsecond", "Micro", @@ -108,7 +117,6 @@ def test_units_H_S_deprecated(self, unit): "nanosecond", "nano", "nanos", - "NS", "Nanoseconds", "Nanosecond", "Nano", @@ -250,8 +258,8 @@ def test_from_tick_reso(): def test_construction(): expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8") - assert Timedelta(10, unit="d")._value == expected - assert Timedelta(10.0, unit="d")._value == expected + assert Timedelta(10, unit="D")._value == expected + assert Timedelta(10.0, unit="D")._value == expected assert Timedelta("10 days")._value == expected assert Timedelta(days=10)._value == expected assert Timedelta(days=10.0)._value == expected diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index e1b0076d5b7b9..1aafeec2ceed5 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize( "td, expected_repr", [ - (Timedelta(10, unit="d"), "Timedelta('10 days 00:00:00')"), + (Timedelta(10, unit="D"), "Timedelta('10 days 00:00:00')"), (Timedelta(10, unit="s"), "Timedelta('0 days 00:00:10')"), (Timedelta(10, unit="ms"), "Timedelta('0 days 00:00:00.010000')"), (Timedelta(-10, unit="ms"), "Timedelta('-1 days +23:59:59.990000')"), diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 01e7ba52e58aa..8be2ec846a6d9 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -280,7 +280,7 @@ def test_timedelta_class_min_max_resolution(): class TestTimedeltaUnaryOps: def test_invert(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") msg = "bad operand type for unary ~" with pytest.raises(TypeError, match=msg): @@ -295,17 +295,17 @@ def test_invert(self): ~(td.to_timedelta64()) def test_unary_ops(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") # __neg__, __pos__ - assert -td == Timedelta(-10, unit="d") - assert -td == Timedelta("-10d") - assert +td == Timedelta(10, unit="d") + assert -td == Timedelta(-10, unit="D") + assert -td == Timedelta("-10D") + assert +td == Timedelta(10, unit="D") # __abs__, __abs__(__neg__) assert abs(td) == td assert abs(-td) == td - assert abs(-td) == Timedelta("10d") + assert abs(-td) == Timedelta("10D") class TestTimedeltas: @@ -334,7 +334,7 @@ def test_total_seconds_scalar(self): assert np.isnan(rng.total_seconds()) def test_conversion(self): - for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: + for td in [Timedelta(10, unit="D"), Timedelta("1 days, 10:11:12.012345")]: pydt = td.to_pytimedelta() assert td == Timedelta(pydt) assert td == pydt @@ -450,7 +450,7 @@ def test_numeric_conversions(self): assert Timedelta(10, 
unit="us") == np.timedelta64(10, "us") assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") assert Timedelta(10, unit="s") == np.timedelta64(10, "s") - assert Timedelta(10, unit="d") == np.timedelta64(10, "D") + assert Timedelta(10, unit="D") == np.timedelta64(10, "D") def test_timedelta_conversions(self): assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( @@ -474,7 +474,7 @@ def test_to_numpy_alias(self): td.to_numpy(copy=True) def test_identity(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") assert isinstance(td, Timedelta) assert isinstance(td, timedelta) @@ -489,7 +489,10 @@ def conv(v): assert Timedelta("1000") == np.timedelta64(1000, "ns") assert Timedelta("1000ns") == np.timedelta64(1000, "ns") - assert Timedelta("1000NS") == np.timedelta64(1000, "ns") + + msg = "'NS' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timedelta("1000NS") == np.timedelta64(1000, "ns") assert Timedelta("10us") == np.timedelta64(10000, "ns") assert Timedelta("100us") == np.timedelta64(100000, "ns") @@ -508,8 +511,10 @@ def conv(v): assert Timedelta("100s") == np.timedelta64(100000000000, "ns") assert Timedelta("1000s") == np.timedelta64(1000000000000, "ns") - assert Timedelta("1d") == conv(np.timedelta64(1, "D")) - assert Timedelta("-1d") == -conv(np.timedelta64(1, "D")) + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timedelta("1d") == conv(np.timedelta64(1, "D")) + assert Timedelta("-1D") == -conv(np.timedelta64(1, "D")) assert Timedelta("1D") == conv(np.timedelta64(1, "D")) assert Timedelta("10D") == conv(np.timedelta64(10, "D")) assert Timedelta("100D") == conv(np.timedelta64(100, "D")) @@ -663,6 +668,26 @@ def test_resolution_deprecated(self): result = Timedelta.resolution assert result == Timedelta(nanoseconds=1) + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): + # GH#59051 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Timedelta(1, unit_depr) + assert result == Timedelta(1, unit) + @pytest.mark.parametrize( "value, expected", diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 4b2122e25f819..d2d92d7273d3d 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -298,7 +298,7 @@ def test_astype_str_cast_dt64(self): def test_astype_str_cast_td64(self): # see GH#9757 - td = Series([Timedelta(1, unit="d")]) + td = Series([Timedelta(1, unit="D")]) ser = td.astype(str) expected = Series(["1 days"], dtype=object) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 937b85a547bcd..e997ae32cf2e2 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -92,7 +92,7 @@ def test_isin_with_i8(self): tm.assert_series_equal(result, expected) # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit="d")) + s = Series(pd.to_timedelta(range(5), unit="D")) result = s.isin(s[0:2]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 56b7cf42a798d..6a5b58c5da6b5 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -46,7 +46,7 @@ def test_nlargest_error(self, r, method, arg): [ pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), pd.to_datetime(["2003", "2002", "2001", "2002", "2005"], utc=True), - pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), + pd.to_timedelta(["3D", "2D", "1D", "2D", "5D"]), np.array([3, 2, 1, 2, 5], dtype="int8"), np.array([3, 2, 1, 2, 5], dtype="int16"), np.array([3, 2, 1, 2, 5], dtype="int32"), diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 894f49b2fa140..9ec2689069da9 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -56,7 +56,10 @@ def test_to_timedelta_same_np_timedelta64(self): def test_to_timedelta_series(self): # Series expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) - result = to_timedelta(Series(["1d", "1days 00:00:01"])) + + msg = "'d' is deprecated and will be removed in a future version." 
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = to_timedelta(Series(["1d", "1days 00:00:01"]))
         tm.assert_series_equal(result, expected)

     def test_to_timedelta_units(self):
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index fc8d7f69b8180..0f2386d1f229f 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -578,7 +578,7 @@ def test_missing_minp_zero_variable():
         [np.nan] * 4,
         index=DatetimeIndex(["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"]),
     )
-    result = x.rolling(Timedelta("2d"), min_periods=0).sum()
+    result = x.rolling(Timedelta("2D"), min_periods=0).sum()
     expected = Series(0.0, index=x.index)
     tm.assert_series_equal(result, expected)

From 6c4903e1c9e8b1246394d26dff2b6c9d081187b0 Mon Sep 17 00:00:00 2001
From: Zhengbo Wang
Date: Sat, 22 Jun 2024 01:59:45 +0800
Subject: [PATCH 107/272] DOC: Change `to_numeric`'s `dtype_backend` default doc (#59021)

* DOC: Change `to_numeric`'s `dtype_backend` default doc

* improve

* improve

* improve
---
 pandas/core/tools/numeric.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index 3d28a73df99d1..3d406d3bfb115 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -64,6 +64,7 @@ def to_numeric(
     ----------
     arg : scalar, list, tuple, 1-d array, or Series
         Argument to be converted.
+
     errors : {'raise', 'coerce'}, default 'raise'
         - If 'raise', then invalid parsing will raise an exception.
         - If 'coerce', then invalid parsing will be set as NaN.
@@ -88,14 +89,15 @@
         the dtype it is to be cast to, so if none of the dtypes
         checked satisfy that specification, no downcasting will be
         performed on the data.
-    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
         Back-end data type applied to the resultant :class:`DataFrame`
-        (still experimental). Behaviour is as follows:
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:

-        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
-          (default).
-        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
-          DataFrame.
+        * ``"numpy_nullable"``: returns nullable-dtype-backed data
+        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` data

         .. versionadded:: 2.0
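The rewritten ``dtype_backend`` bullets are easiest to read next to a concrete call. A small sketch of the three behaviours the new wording distinguishes (the ``"pyarrow"`` case assumes the optional ``pyarrow`` dependency is installed):

    import pandas as pd

    ser = pd.Series(["1", "2", None])

    # Default: no nullable backend; the missing value forces float64
    print(pd.to_numeric(ser).dtype)  # float64

    # Nullable NumPy-backed result, with pd.NA for the missing entry
    print(pd.to_numeric(ser, dtype_backend="numpy_nullable").dtype)  # Int64

    # PyArrow-backed nullable result
    print(pd.to_numeric(ser, dtype_backend="pyarrow").dtype)  # int64[pyarrow]
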
From a5e812d86deb62872f8d514d894a22931fc84217 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 21 Jun 2024 08:04:33 -1000
Subject: [PATCH 108/272] DEPS: Drop Python 3.9 (#58238)

* DEPS: Drop Python 3.9

* Update GHA files

* remove 3.8 ref in test file

* Move back to 3.9 for pypy

* Bump pyupgrade

* Run pyupgrade

* Remove pandas.compat.compressors

* Fix env file

* Wrong error message

* Ignore pypy

* Test package checks with 3.12

* Modify subprocess test

* revert conda-forge checks

* Remove 3.9 from circleci

* Pyupgrade

* Don't build 39 wheels
---
 .circleci/config.yml | 4 +-
 .github/workflows/package-checks.yml | 2 +-
 .github/workflows/unit-tests.yml | 8 +-
 .github/workflows/wheels.yml | 2 +-
 .pre-commit-config.yaml | 2 +-
 ...yaml => actions-310-minimum_versions.yaml} | 2 +-
 ci/deps/actions-39.yaml | 63 ---------------
 .../development/contributing_environment.rst | 2 +-
 doc/source/getting_started/install.rst | 2 +-
 doc/source/whatsnew/v3.0.0.rst | 5 ++
 pandas/_config/config.py | 2 +-
 pandas/_testing/__init__.py | 3 +-
 pandas/_testing/_io.py | 15 ++--
 pandas/_typing.py | 20 ++---
 pandas/_version.py | 2 +-
 pandas/compat/__init__.py | 49 ------------
 pandas/compat/_constants.py | 2 -
 pandas/compat/_optional.py | 2 +-
 pandas/compat/compressors.py | 77 -------------------
 pandas/conftest.py | 6 +-
 pandas/core/_numba/executor.py | 2 +-
 pandas/core/accessor.py | 3 +-
 pandas/core/apply.py | 2 +-
 .../array_algos/datetimelike_accumulations.py | 5 +-
 .../core/array_algos/masked_accumulations.py | 7 +-
 pandas/core/array_algos/masked_reductions.py | 7 +-
 pandas/core/arrays/arrow/array.py | 6 +-
 pandas/core/arrays/base.py | 2 +-
 pandas/core/arrays/categorical.py | 2 +-
 pandas/core/arrays/datetimelike.py | 2 +-
 pandas/core/arrays/interval.py | 2 +-
 pandas/core/arrays/masked.py | 2 +-
 pandas/core/arrays/numeric.py | 6 +-
 pandas/core/arrays/period.py | 6 +-
 pandas/core/arrays/sparse/array.py | 6 +-
 pandas/core/arrays/string_arrow.py | 6 +-
 pandas/core/common.py | 2 +-
 pandas/core/computation/align.py | 10 +--
 pandas/core/computation/expr.py | 5 +-
 pandas/core/computation/ops.py | 2 +-
 pandas/core/config_init.py | 2 +-
 pandas/core/dtypes/common.py | 3 +-
 pandas/core/frame.py | 2 +-
 pandas/core/generic.py | 2 +-
 pandas/core/groupby/generic.py | 2 +-
 pandas/core/groupby/groupby.py | 2 +-
 pandas/core/groupby/numba_.py | 3 +-
 pandas/core/groupby/ops.py | 2 +-
 pandas/core/indexes/base.py | 2 +-
 pandas/core/indexes/extension.py | 3 +-
 pandas/core/indexes/multi.py | 2 +-
 pandas/core/indexes/range.py | 2 +-
 pandas/core/internals/blocks.py | 2 +-
 pandas/core/internals/managers.py | 2 +-
 pandas/core/methods/describe.py | 2 +-
 pandas/core/nanops.py | 5 +-
 pandas/core/ops/common.py | 7 +-
 pandas/core/ops/invalid.py | 3 +-
 pandas/core/resample.py | 6 +-
 pandas/core/reshape/concat.py | 2 +-
 pandas/core/reshape/pivot.py | 6 +-
 pandas/core/reshape/tile.py | 3 +-
 pandas/core/series.py | 2 +-
 pandas/core/sorting.py | 2 +-
 pandas/core/strings/accessor.py | 2 +-
 pandas/core/strings/base.py | 6 +-
 pandas/core/strings/object_array.py | 17 ++--
 pandas/core/tools/datetimes.py | 6 +-
 pandas/core/util/numba_.py | 8 +-
 pandas/core/window/expanding.py | 3 +-
 pandas/core/window/numba_.py | 3 +-
 pandas/core/window/rolling.py | 2 +-
 pandas/io/_util.py | 5 +-
 pandas/io/common.py | 12 +--
 pandas/io/excel/_base.py | 2 +-
 pandas/io/excel/_util.py | 2 +-
 pandas/io/formats/css.py | 6 +-
 pandas/io/formats/excel.py | 2 +-
 pandas/io/formats/format.py | 2 +-
pandas/io/formats/printing.py | 2 +- pandas/io/formats/style.py | 2 +- pandas/io/formats/style_render.py | 6 +- pandas/io/json/_json.py | 2 +- pandas/io/parsers/base_parser.py | 2 +- pandas/io/parsers/readers.py | 2 +- pandas/io/pytables.py | 2 +- pandas/io/sql.py | 2 +- pandas/io/stata.py | 2 +- pandas/io/xml.py | 6 +- pandas/plotting/_core.py | 2 +- pandas/tests/groupby/test_numeric_only.py | 3 +- pandas/tests/io/excel/test_writers.py | 14 ++-- pandas/tests/io/test_compression.py | 4 +- pandas/tests/io/test_pickle.py | 40 +--------- .../scalar/timestamp/test_constructors.py | 7 +- pandas/tests/test_common.py | 19 ----- pandas/tseries/holiday.py | 5 +- pandas/util/_decorators.py | 6 +- pandas/util/_test_decorators.py | 6 +- pandas/util/version/__init__.py | 6 +- pyproject.toml | 6 +- ...check_for_inconsistent_pandas_namespace.py | 9 +-- scripts/tests/data/deps_minimum.toml | 2 - scripts/validate_unwanted_patterns.py | 10 +-- 104 files changed, 223 insertions(+), 446 deletions(-) rename ci/deps/{actions-39-minimum_versions.yaml => actions-310-minimum_versions.yaml} (98%) delete mode 100644 ci/deps/actions-39.yaml delete mode 100644 pandas/compat/compressors.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 463667446ed42..4acc6473e6add 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -141,11 +141,9 @@ workflows: only: /^v.*/ matrix: parameters: - cibw-build: ["cp39-manylinux_aarch64", - "cp310-manylinux_aarch64", + cibw-build: ["cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64", - "cp39-musllinux_aarch64", "cp310-musllinux_aarch64", "cp311-musllinux_aarch64", "cp312-musllinux_aarch64",] diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 2de1649d42dfd..97f90c1588962 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -53,7 +53,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.10', '3.11'] fail-fast: false name: Test Conda Forge Recipe - Python ${{ matrix.python-version }} concurrency: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 1b88d4d90d3e1..600ffd56b6d56 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -26,7 +26,7 @@ jobs: timeout-minutes: 90 strategy: matrix: - env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] include: @@ -35,7 +35,7 @@ jobs: pattern: "not slow and not network and not single_cpu" pytest_target: "pandas/tests/test_downstream.py" - name: "Minimum Versions" - env_file: actions-39-minimum_versions.yaml + env_file: actions-310-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" - name: "Locale: it_IT" env_file: actions-311.yaml @@ -146,6 +146,8 @@ jobs: - name: Build Pandas id: build uses: ./.github/actions/build_pandas + # TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge + if: ${{ matrix.name != 'Pypy' }} with: meson_args: ${{ matrix.meson_args }} cflags_adds: ${{ matrix.cflags_adds }} @@ -170,7 +172,7 @@ jobs: matrix: # Note: Don't use macOS latest since macos 14 appears to be arm64 only os: [macos-13, macos-14, windows-latest] - env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] fail-fast: 
false runs-on: ${{ matrix.os }} name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 4b34d2b21495b..b92588d81f4ed 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -99,7 +99,7 @@ jobs: - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] + python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf88500b10524..c32f727213152 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -75,7 +75,7 @@ repos: rev: v3.15.2 hooks: - id: pyupgrade - args: [--py39-plus] + args: [--py310-plus] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml similarity index 98% rename from ci/deps/actions-39-minimum_versions.yaml rename to ci/deps/actions-310-minimum_versions.yaml index b760f27a3d4d3..a9c205d24d212 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -4,7 +4,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.9 + - python=3.10 # build dependencies - versioneer[toml] diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml deleted file mode 100644 index 8f235a836bb3d..0000000000000 --- a/ci/deps/actions-39.yaml +++ /dev/null @@ -1,63 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.9 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2023.10.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 - - xarray>=2022.12.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - - pip: - - adbc-driver-postgresql>=0.10.0 - - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 - - pytest-localserver>=0.7.1 diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 325c902dd4f9e..0691414f53306 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -130,7 +130,7 @@ Consult the docs for setting up pyenv `here `__. 
pyenv virtualenv # For instance: - pyenv virtualenv 3.9.10 pandas-dev + pyenv virtualenv 3.10 pandas-dev # Activate the virtualenv pyenv activate pandas-dev diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 01a79fc8e36fd..86ce05fde547b 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -161,7 +161,7 @@ Python terminal. >>> import pandas as pd >>> pd.test() - running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.9/site-packages/pandas + running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.10/site-packages/pandas ============================= test session starts ============================== platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9748a461859c2..3d869bf31f372 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -190,6 +190,11 @@ In cases with mixed-resolution inputs, the highest resolution is used: .. _whatsnew_300.api_breaking.deps: +Increased minimum version for Python +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pandas 3.0.0 supports Python 3.10 and higher. + Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Some minimum supported versions of dependencies were updated. diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 95c549a8ff0e8..51794ec04b29e 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -55,7 +55,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, NamedTuple, cast, ) @@ -66,6 +65,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Sequence, ) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 85d03ea17bf42..fb8ca8aad3428 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -6,7 +6,6 @@ from sys import byteorder from typing import ( TYPE_CHECKING, - Callable, ContextManager, cast, ) @@ -85,6 +84,8 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( Dtype, NpDtype, diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 2955108d3db1a..e1841c95dcdfe 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -7,21 +7,18 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import uuid import zipfile -from pandas.compat import ( - get_bz2_file, - get_lzma_file, -) from pandas.compat._optional import import_optional_dependency import pandas as pd from pandas._testing.contexts import ensure_clean if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( FilePath, ReadPickleBuffer, @@ -129,11 +126,15 @@ def write_to_compressed(compression, path, data, dest: str = "test") -> None: elif compression == "gzip": compress_method = gzip.GzipFile elif compression == "bz2": - compress_method = get_bz2_file() + import bz2 + + compress_method = bz2.BZ2File elif compression == "zstd": compress_method = import_optional_dependency("zstandard").open elif compression == "xz": - compress_method = get_lzma_file() + import lzma + + compress_method = lzma.LZMAFile else: raise ValueError(f"Unrecognized compression type: {compression}") diff --git a/pandas/_typing.py b/pandas/_typing.py index ef68018f2721a..d90596878ba51 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,6 +1,7 @@ from 
__future__ import annotations from collections.abc import ( + Callable, Hashable, Iterator, Mapping, @@ -18,7 +19,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, Optional, Protocol, @@ -90,18 +90,12 @@ # Name "npt._ArrayLikeInt_co" is not defined [name-defined] NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined] - from typing import SupportsIndex - - if sys.version_info >= (3, 10): - from typing import Concatenate # pyright: ignore[reportUnusedImport] - from typing import ParamSpec - from typing import TypeGuard # pyright: ignore[reportUnusedImport] - else: - from typing_extensions import ( # pyright: ignore[reportUnusedImport] - Concatenate, - ParamSpec, - TypeGuard, - ) + from typing import ( + ParamSpec, + SupportsIndex, + ) + from typing import Concatenate # pyright: ignore[reportUnusedImport] + from typing import TypeGuard # pyright: ignore[reportUnusedImport] P = ParamSpec("P") diff --git a/pandas/_version.py b/pandas/_version.py index 7bd9da2bb1cfa..b32c9e67fdbb6 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -10,13 +10,13 @@ """Git implementation of _version.py.""" +from collections.abc import Callable import errno import functools import os import re import subprocess import sys -from typing import Callable def get_keywords(): diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 4583e7edebbdc..13e6707667d0a 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -18,13 +18,11 @@ from pandas.compat._constants import ( IS64, ISMUSL, - PY310, PY311, PY312, PYPY, WASM, ) -import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( pa_version_under10p1, @@ -148,52 +146,6 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: - """ - Importing the `LZMAFile` class from the `lzma` module. - - Returns - ------- - class - The `LZMAFile` class from the `lzma` module. - - Raises - ------ - RuntimeError - If the `lzma` module was not imported correctly, or didn't exist. - """ - if not pandas.compat.compressors.has_lzma: - raise RuntimeError( - "lzma module not available. " - "A Python re-install with the proper dependencies, " - "might be required to solve this issue." - ) - return pandas.compat.compressors.LZMAFile - - -def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: - """ - Importing the `BZ2File` class from the `bz2` module. - - Returns - ------- - class - The `BZ2File` class from the `bz2` module. - - Raises - ------ - RuntimeError - If the `bz2` module was not imported correctly, or didn't exist. - """ - if not pandas.compat.compressors.has_bz2: - raise RuntimeError( - "bz2 module not available. " - "A Python re-install with the proper dependencies, " - "might be required to solve this issue." 
- ) - return pandas.compat.compressors.BZ2File - - __all__ = [ "is_numpy_dev", "pa_version_under10p1", @@ -204,7 +156,6 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under16p0", "IS64", "ISMUSL", - "PY310", "PY311", "PY312", "PYPY", diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index 2625389e5254a..c7b7341013251 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -13,7 +13,6 @@ IS64 = sys.maxsize > 2**32 -PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) PY312 = sys.version_info >= (3, 12) PYPY = platform.python_implementation() == "PyPy" @@ -24,7 +23,6 @@ __all__ = [ "IS64", "ISMUSL", - "PY310", "PY311", "PY312", "PYPY", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index f4e717c26d6fd..b62a4c8dcc8c8 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: import types -# Update install.rst, actions-39-minimum_versions.yaml, +# Update install.rst, actions-310-minimum_versions.yaml, # deps_minimum.toml & pyproject.toml when updating versions! VERSIONS = { diff --git a/pandas/compat/compressors.py b/pandas/compat/compressors.py deleted file mode 100644 index 1f31e34c092c9..0000000000000 --- a/pandas/compat/compressors.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5. -""" - -from __future__ import annotations - -from pickle import PickleBuffer - -from pandas.compat._constants import PY310 - -try: - import bz2 - - has_bz2 = True -except ImportError: - has_bz2 = False - -try: - import lzma - - has_lzma = True -except ImportError: - has_lzma = False - - -def flatten_buffer( - b: bytes | bytearray | memoryview | PickleBuffer, -) -> bytes | bytearray | memoryview: - """ - Return some 1-D `uint8` typed buffer. - - Coerces anything that does not match that description to one that does - without copying if possible (otherwise will copy). - """ - - if isinstance(b, (bytes, bytearray)): - return b - - if not isinstance(b, PickleBuffer): - b = PickleBuffer(b) - - try: - # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy - return b.raw() - except BufferError: - # perform in-memory copy if buffer is not contiguous - return memoryview(b).tobytes("A") - - -if has_bz2: - - class BZ2File(bz2.BZ2File): - if not PY310: - - def write(self, b) -> int: - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. - return super().write(flatten_buffer(b)) - - -if has_lzma: - - class LZMAFile(lzma.LZMAFile): - if not PY310: - - def write(self, b) -> int: - # Workaround issue where `lzma.LZMAFile` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. 
- return super().write(flatten_buffer(b)) diff --git a/pandas/conftest.py b/pandas/conftest.py index 163c3890a7f6d..c3bfc8c06ad8a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -32,10 +32,7 @@ import gc import operator import os -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import uuid from dateutil.tz import ( @@ -83,6 +80,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, ) diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 82fd4e34ac67b..3f3ebe8dbe023 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -4,10 +4,10 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) if TYPE_CHECKING: + from collections.abc import Callable from pandas._typing import Scalar import numpy as np diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 3acbfc3eabbac..d8463fda34caa 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, - Callable, final, ) import warnings @@ -18,6 +17,8 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import TypeT from pandas import Index diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 75ad17b59bf88..607a65598783f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -2,13 +2,13 @@ import abc from collections import defaultdict +from collections.abc import Callable import functools from functools import partial import inspect from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, ) diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index c3a7c2e4fefb2..bc10dbfbec90d 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Callable +from typing import TYPE_CHECKING import numpy as np @@ -12,6 +12,9 @@ from pandas.core.dtypes.missing import isna +if TYPE_CHECKING: + from collections.abc import Callable + def _cum_func( func: Callable, diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index b31d32a606eed..b4e116388b85e 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -5,14 +5,13 @@ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import numpy as np if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import npt diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 3784689995802..f2a32fbe2b0e5 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -5,10 +5,7 @@ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings import numpy as np @@ -18,6 +15,8 @@ from pandas.core.nanops import check_below_min_count if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( AxisInt, npt, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8c39e0d87df4e..4ff7553af2b69 100644 --- a/pandas/core/arrays/arrow/array.py +++ 
b/pandas/core/arrays/arrow/array.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -174,7 +173,10 @@ def floordiv_compat( } if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._libs.missing import NAType from pandas._typing import ( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f83fdcd46b371..1e8fec7fde3de 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, cast, @@ -78,6 +77,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 64e5eec43a5c1..c656e4bf1e20c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,7 +6,6 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, - Callable, Literal, cast, overload, @@ -94,6 +93,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, Sequence, diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 673001337767b..c90ff410b4b93 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, Union, cast, @@ -148,6 +147,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2e1ea7236e5c4..52d64162358c8 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -8,7 +8,6 @@ import textwrap from typing import ( TYPE_CHECKING, - Callable, Literal, Union, overload, @@ -99,6 +98,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 04cffcaaa5f04..93471788e72ab 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -3,7 +3,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -73,6 +72,7 @@ from pandas.core.util.hashing import hash_array if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Iterator, Sequence, diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index c5e9ed8698ffe..2c0236273e731 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -28,7 +27,10 @@ ) if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import ( + Callable, + Mapping, + ) import pyarrow diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8baf363b909fb..e762c3e547819 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, TypeVar, cast, @@ -75,7 +74,10 @@ import pandas.core.common as com if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( AnyArrayLike, diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index adf8f44377e62..3a08344369822 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py 
@@ -10,7 +10,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -87,7 +86,10 @@ # See https://github.com/python/typing/issues/684 if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from enum import Enum class ellipsis(Enum): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f2fd9d5d6610f..97c06149d0b7e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -5,7 +5,6 @@ import re from typing import ( TYPE_CHECKING, - Callable, Union, cast, ) @@ -53,7 +52,10 @@ if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( ArrayLike, diff --git a/pandas/core/common.py b/pandas/core/common.py index 96291991227d9..1423ea456384b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -12,6 +12,7 @@ defaultdict, ) from collections.abc import ( + Callable, Collection, Generator, Hashable, @@ -24,7 +25,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, TypeVar, cast, overload, diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index b4e33b8ac75cb..7de4d8cdf99e1 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -8,10 +8,7 @@ partial, wraps, ) -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings import numpy as np @@ -31,7 +28,10 @@ from pandas.core.computation.common import result_type_many if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import F diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index a8123a898b4fe..b287cd542068d 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -12,7 +12,7 @@ from keyword import iskeyword import tokenize from typing import ( - Callable, + TYPE_CHECKING, ClassVar, TypeVar, ) @@ -47,6 +47,9 @@ from pandas.io.formats import printing +if TYPE_CHECKING: + from collections.abc import Callable + def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: """ diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index d69765e91f467..056325fd2e4ab 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -9,7 +9,6 @@ import operator from typing import ( TYPE_CHECKING, - Callable, Literal, ) @@ -37,6 +36,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterable, Iterator, ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 46c9139c3456c..05661033bd5ed 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -12,8 +12,8 @@ from __future__ import annotations +from collections.abc import Callable import os -from typing import Callable import pandas._config.config as cf from pandas._config.config import ( diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2ac75a0700759..bee8af46baa64 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import warnings @@ -55,6 +54,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( ArrayLike, DtypeObj, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0aeda77233125..08b339dc26452 100644 --- a/pandas/core/frame.py +++ 
b/pandas/core/frame.py @@ -14,6 +14,7 @@ import collections from collections import abc from collections.abc import ( + Callable, Hashable, Iterable, Iterator, @@ -29,7 +30,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 93068c665a880..b4908ad7a2158 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, NoReturn, @@ -185,6 +184,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Hashable, Iterator, diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index eb334e0e57493..c112d9b6a4b54 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -9,12 +9,12 @@ from __future__ import annotations from collections import abc +from collections.abc import Callable from functools import partial from textwrap import dedent from typing import ( TYPE_CHECKING, Any, - Callable, Literal, NamedTuple, TypeVar, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d45c891d6413b..763fd4e59a978 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -10,6 +10,7 @@ class providing the base-class of operations. from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Iterator, @@ -24,7 +25,6 @@ class providing the base-class of operations. from textwrap import dedent from typing import ( TYPE_CHECKING, - Callable, Literal, TypeVar, Union, diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index b22fc9248eeca..73b681c64c3a3 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -20,6 +19,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import Scalar diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 58c27d80ea99a..da80969b613cd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,7 +12,6 @@ import functools from typing import ( TYPE_CHECKING, - Callable, Generic, final, ) @@ -70,6 +69,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Hashable, Iterator, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 15c318e5e9caf..71dfff520113c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -8,7 +8,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, NoReturn, @@ -193,6 +192,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Sequence, diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index fc806a3546571..48d5e59250f35 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -7,7 +7,6 @@ from inspect import signature from typing import ( TYPE_CHECKING, - Callable, TypeVar, ) @@ -18,6 +17,8 @@ from pandas.core.indexes.base import Index if TYPE_CHECKING: + from collections.abc import Callable + import numpy as np from pandas._typing import ( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 63908ada0c73e..9d7c7f3e4a5c9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,7 @@ from __future__ import 
annotations from collections.abc import ( + Callable, Collection, Generator, Hashable, @@ -12,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, ) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index bd9e8b84fd82a..ce9e639656acb 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterator, ) @@ -10,7 +11,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cffb1f658a640..3614d43425a09 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, final, @@ -121,6 +120,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterable, Sequence, diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 79cba9275a119..b47d5fe18b9c9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -8,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, NoReturn, cast, diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index ef20d4c509732..17d4d38c97f33 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -12,7 +12,6 @@ ) from typing import ( TYPE_CHECKING, - Callable, cast, ) @@ -42,6 +41,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 22092551ec882..e775156a6ae2f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -3,8 +3,8 @@ import functools import itertools from typing import ( + TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -48,6 +48,9 @@ notna, ) +if TYPE_CHECKING: + from collections.abc import Callable + bn = import_optional_dependency("bottleneck", errors="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py index d19ac6246e1cd..5cbe1c421e05a 100644 --- a/pandas/core/ops/common.py +++ b/pandas/core/ops/common.py @@ -5,10 +5,7 @@ from __future__ import annotations from functools import wraps -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING from pandas._libs.lib import item_from_zerodim from pandas._libs.missing import is_matching_na @@ -19,6 +16,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import F diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py index c300db8c114c1..395db1617cb63 100644 --- a/pandas/core/ops/invalid.py +++ b/pandas/core/ops/invalid.py @@ -8,13 +8,14 @@ from typing import ( TYPE_CHECKING, Any, - Callable, NoReturn, ) import numpy as np if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( ArrayLike, Scalar, diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ccbe25fdae841..8ee71ea2293e6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -4,7 +4,6 @@ from textwrap import dedent from typing import ( TYPE_CHECKING, - Callable, Literal, cast, final, @@ -92,7 +91,10 @@ ) if TYPE_CHECKING: - from collections.abc import Hashable + from 
collections.abc import ( + Callable, + Hashable, + ) from pandas._typing import ( Any, diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 7055201b5a1ee..2d2787e56f402 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -7,7 +7,6 @@ from collections import abc from typing import ( TYPE_CHECKING, - Callable, Literal, cast, overload, @@ -46,6 +45,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Mapping, diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 131924bc059f6..2dc5c7af00958 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -3,7 +3,6 @@ import itertools from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -36,7 +35,10 @@ from pandas.core.series import Series if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._typing import ( AggFuncType, diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index d780433386395..0052bcfe09147 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -44,6 +43,8 @@ from pandas.core.arrays.datetimelike import dtype_to_unit if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( DtypeObj, IntervalLeftRight, diff --git a/pandas/core/series.py b/pandas/core/series.py index e833c2a078762..2781fa6af0d42 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -17,7 +18,6 @@ IO, TYPE_CHECKING, Any, - Callable, Literal, cast, overload, diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 4fba243f73536..0d8f42694ccb4 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -5,7 +5,6 @@ import itertools from typing import ( TYPE_CHECKING, - Callable, cast, ) @@ -32,6 +31,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7494a43caf004..dd9276179cf4d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -5,7 +5,6 @@ import re from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -50,6 +49,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, ) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index c1f94abff428a..1281a03e297f9 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -3,14 +3,16 @@ import abc from typing import ( TYPE_CHECKING, - Callable, Literal, ) import numpy as np if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) import re from pandas._typing import ( diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index bdcf55e61d2d1..290a28ab60ae1 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -5,7 +5,6 @@ import textwrap from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -22,7 +21,10 @@ from pandas.core.strings.base import BaseStringArrayMethods if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( NpDtype, @@ -457,16 +459,7 
@@ def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) def _str_removeprefix(self, prefix: str): - # outstanding question on whether to use native methods for users on Python 3.9+ - # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770, - # in which case we could do return self._str_map(str.removeprefix) - - def removeprefix(text: str) -> str: - if text.startswith(prefix): - return text[len(prefix) :] - return text - - return self._str_map(removeprefix) + return self._str_map(lambda x: x.removeprefix(prefix)) def _str_removesuffix(self, suffix: str): return self._str_map(lambda x: x.removesuffix(suffix)) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c116ef015ae16..9b8970f86ed6d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -6,7 +6,6 @@ from itertools import islice from typing import ( TYPE_CHECKING, - Callable, TypedDict, Union, cast, @@ -77,7 +76,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.timedeltas import UnitChoices diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index d93984d210cb4..de024f612516b 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -4,16 +4,16 @@ import inspect import types -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import numpy as np from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError +if TYPE_CHECKING: + from collections.abc import Callable + GLOBAL_USE_NUMBA: bool = False diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index f14954cd9a4b0..d0c8a2e67b6ca 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -32,6 +31,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( QuantileInterpolation, WindowingRankType, diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 824cf936b8185..621b0f2c0f2d8 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -14,6 +13,8 @@ from pandas.core.util.numba_ import jit_user_function if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import Scalar diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 2243d8dd1a613..16aa6d7e56a1c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -90,6 +89,7 @@ ) if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Hashable, Iterator, diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 3b2ae5daffdba..cb0f89945e440 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,11 +1,14 @@ from __future__ import annotations -from typing import Callable +from typing import TYPE_CHECKING from pandas.compat._optional import import_optional_dependency import pandas as pd +if TYPE_CHECKING: + from collections.abc import Callable + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") diff 
--git a/pandas/io/common.py b/pandas/io/common.py index 4507a7d08c8ba..a76f0cf6dd34d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -55,10 +55,6 @@ BaseBuffer, ReadCsvBuffer, ) -from pandas.compat import ( - get_bz2_file, - get_lzma_file, -) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -784,9 +780,11 @@ def get_handle( # BZ Compression elif compression == "bz2": + import bz2 + # Overload of "BZ2File" to handle pickle protocol 5 # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" - handle = get_bz2_file()( # type: ignore[call-overload] + handle = bz2.BZ2File( # type: ignore[call-overload] handle, mode=ioargs.mode, **compression_args, @@ -849,7 +847,9 @@ def get_handle( # error: Argument 1 to "LZMAFile" has incompatible type "Union[str, # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], # PathLike[bytes]], IO[bytes]], None]" - handle = get_lzma_file()( + import lzma + + handle = lzma.LZMAFile( handle, # type: ignore[arg-type] ioargs.mode, **compression_args, diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1eb22d4ee9de7..de0ef3728fb6e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -14,7 +15,6 @@ IO, TYPE_CHECKING, Any, - Callable, Generic, Literal, TypeVar, diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index f879f16aa5dc8..e7c5d518abaee 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, MutableMapping, @@ -9,7 +10,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, TypeVar, overload, diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index d3d0da6f562a7..0af04526ea96d 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -5,10 +5,7 @@ from __future__ import annotations import re -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings from pandas.errors import CSSWarning @@ -16,6 +13,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterable, Iterator, diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b6c6112b05ab3..a98d9c175c2bd 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -16,7 +17,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, ) import warnings diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c503121328f53..9ad5ac83e9eae 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -6,6 +6,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Generator, Hashable, Mapping, @@ -22,7 +23,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Final, cast, ) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 0bd4f2935f4d0..67b5eb6f5ee5b 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Iterable, Mapping, Sequence, @@ -13,7 +14,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, TypeVar, Union, 
) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 8212b50594842..a695c539977b3 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -9,7 +9,6 @@ import operator from typing import ( TYPE_CHECKING, - Callable, overload, ) @@ -55,6 +54,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 92afbc0e150ef..19a3563f43b4e 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -1,13 +1,15 @@ from __future__ import annotations from collections import defaultdict -from collections.abc import Sequence +from collections.abc import ( + Callable, + Sequence, +) from functools import partial import re from typing import ( TYPE_CHECKING, Any, - Callable, DefaultDict, Optional, TypedDict, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ff01d2f62761b..74e6595a7f0f2 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Generic, Literal, TypeVar, @@ -65,6 +64,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Mapping, ) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c6cc85b9f722b..7e91d9e262748 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, final, overload, @@ -78,6 +77,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterable, Mapping, Sequence, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 66edbcaa755ed..8a07c99b0fe94 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -17,7 +17,6 @@ IO, TYPE_CHECKING, Any, - Callable, Generic, Literal, TypedDict, @@ -70,6 +69,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Mapping, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d98c51159eb63..1420ce84b4db8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -18,7 +18,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Final, Literal, cast, @@ -102,6 +101,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, Sequence, diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c8c9fd99d0165..41b368c9b05c2 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -23,7 +23,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -67,6 +66,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterator, Mapping, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d1e57ad568ba5..5146876d20374 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -25,7 +25,6 @@ IO, TYPE_CHECKING, AnyStr, - Callable, Final, cast, ) @@ -74,6 +73,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a6cd06cd61687..8c7381a926e72 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) from pandas._libs import lib @@ -35,7 +34,10 @@ from pandas.io.parsers import TextParser if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from xml.etree.ElementTree import Element from lxml import etree diff --git 
a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 0a29ab530c2fc..61c44e58b643a 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -3,7 +3,6 @@ import importlib from typing import ( TYPE_CHECKING, - Callable, Literal, ) @@ -27,6 +26,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index afbc64429e93c..7e7c84fa2b390 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -291,8 +291,7 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): [ "not allowed for this dtype", "cannot be performed against 'object' dtypes", - # On PY39 message is "a number"; on PY310 and after is "a real number" - "must be a string or a.* number", + "must be a string or a real number", "unsupported operand type", "function is not implemented for this dtype", re.escape(f"agg function failed [how->{kernel},dtype->object]"), diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 744fe20e4995d..ad1f22224bc0d 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -12,7 +12,6 @@ import numpy as np import pytest -from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -1251,13 +1250,12 @@ def test_engine_kwargs(self, engine, tmp_excel): "xlsxwriter": r"__init__() got an unexpected keyword argument 'foo'", } - if PY310: - msgs["openpyxl"] = ( - "Workbook.__init__() got an unexpected keyword argument 'foo'" - ) - msgs["xlsxwriter"] = ( - "Workbook.__init__() got an unexpected keyword argument 'foo'" - ) + msgs["openpyxl"] = ( + "Workbook.__init__() got an unexpected keyword argument 'foo'" + ) + msgs["xlsxwriter"] = ( + "Workbook.__init__() got an unexpected keyword argument 'foo'" + ) # Handle change in error message for openpyxl (write and append mode) if engine == "openpyxl" and not os.path.exists(tmp_excel): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 00082be7e07e8..efc3e71564260 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -231,7 +231,7 @@ def test_with_missing_lzma(): @pytest.mark.single_cpu def test_with_missing_lzma_runtime(): - """Tests if RuntimeError is hit when calling lzma without + """Tests if ModuleNotFoundError is hit when calling lzma without having the module available. 
""" code = textwrap.dedent( @@ -241,7 +241,7 @@ def test_with_missing_lzma_runtime(): sys.modules['lzma'] = None import pandas as pd df = pd.DataFrame() - with pytest.raises(RuntimeError, match='lzma module'): + with pytest.raises(ModuleNotFoundError, match='import of lzma'): df.to_csv('foo.csv', compression='xz') """ ) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 1420e24858ffb..98abbe3905204 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,7 +13,6 @@ from __future__ import annotations -from array import array import bz2 import datetime import functools @@ -32,12 +31,8 @@ import numpy as np import pytest -from pandas.compat import ( - get_lzma_file, - is_platform_little_endian, -) +from pandas.compat import is_platform_little_endian from pandas.compat._optional import import_optional_dependency -from pandas.compat.compressors import flatten_buffer import pandas as pd from pandas import ( @@ -81,35 +76,6 @@ def compare_element(result, expected, typ): # --------------------- -@pytest.mark.parametrize( - "data", - [ - b"123", - b"123456", - bytearray(b"123"), - memoryview(b"123"), - pickle.PickleBuffer(b"123"), - array("I", [1, 2, 3]), - memoryview(b"123456").cast("B", (3, 2)), - memoryview(b"123456").cast("B", (3, 2))[::2], - np.arange(12).reshape((3, 4), order="C"), - np.arange(12).reshape((3, 4), order="F"), - np.arange(12).reshape((3, 4), order="C")[:, ::2], - ], -) -def test_flatten_buffer(data): - result = flatten_buffer(data) - expected = memoryview(data).tobytes("A") - assert result == expected - if isinstance(data, (bytes, bytearray)): - assert result is data - elif isinstance(result, memoryview): - assert result.ndim == 1 - assert result.format == "B" - assert result.contiguous - assert result.shape == (result.nbytes,) - - def test_pickles(datapath): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") @@ -261,7 +227,9 @@ def compress_file(self, src_path, dest_path, compression): tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path)) tar.addfile(tarinfo, fh) elif compression == "xz": - f = get_lzma_file()(dest_path, "w") + import lzma + + f = lzma.LZMAFile(dest_path, "w") elif compression == "zstd": f = import_optional_dependency("zstandard").open(dest_path, "wb") else: diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 4ebdea3733484..4249063b67d31 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -18,7 +18,6 @@ import pytz from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas.compat import PY310 from pandas.errors import OutOfBoundsDatetime from pandas import ( @@ -211,11 +210,7 @@ def test_timestamp_constructor_adjust_value_for_fold(self, tz, fold, value_out): class TestTimestampConstructorPositionalAndKeywordSupport: def test_constructor_positional(self): # see GH#10758 - msg = ( - "'NoneType' object cannot be interpreted as an integer" - if PY310 - else "an integer is required" - ) + msg = "'NoneType' object cannot be interpreted as an integer" with pytest.raises(TypeError, match=msg): Timestamp(2000, 1) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index bcecd1b2d5eec..7b93416600f8f 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -3,7 +3,6 @@ import string import subprocess import sys -import textwrap import numpy as np import pytest @@ -247,21 +246,3 
@@ def test_str_size(): ] result = subprocess.check_output(call).decode()[-4:-1].strip("\n") assert int(result) == int(expected) - - -@pytest.mark.single_cpu -def test_bz2_missing_import(): - # Check whether bz2 missing import is handled correctly (issue #53857) - code = """ - import sys - sys.modules['bz2'] = None - import pytest - import pandas as pd - from pandas.compat import get_bz2_file - msg = 'bz2 module not available.' - with pytest.raises(RuntimeError, match=msg): - get_bz2_file() - """ - code = textwrap.dedent(code) - call = [sys.executable, "-c", code] - subprocess.check_output(call) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 8e51183138b5c..bf4ec2e551f01 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -4,7 +4,7 @@ datetime, timedelta, ) -from typing import Callable +from typing import TYPE_CHECKING import warnings from dateutil.relativedelta import ( @@ -35,6 +35,9 @@ Easter, ) +if TYPE_CHECKING: + from collections.abc import Callable + def next_monday(dt: datetime) -> datetime: """ diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index bdfb0b1cad8ae..165824bec131f 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -6,7 +6,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -19,7 +18,10 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import ( + Callable, + Mapping, + ) def deprecate( diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 48684c4810d2a..1c17587db72d4 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -27,14 +27,12 @@ def test_foo(): from __future__ import annotations import locale -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import pytest if TYPE_CHECKING: + from collections.abc import Callable from pandas._typing import F from pandas.compat import ( diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index 153424e339c45..9838e371f0d00 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -8,11 +8,13 @@ from __future__ import annotations import collections -from collections.abc import Iterator +from collections.abc import ( + Callable, + Iterator, +) import itertools import re from typing import ( - Callable, SupportsInt, Tuple, Union, diff --git a/pyproject.toml b/pyproject.toml index e7d7474134c3a..661e8efbb95fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ authors = [ { name = 'The Pandas Development Team', email='pandas-dev@python.org' }, ] license = {file = 'LICENSE'} -requires-python = '>=3.9' +requires-python = '>=3.10' dependencies = [ "numpy>=1.23.5; python_version<'3.12'", "numpy>=1.26.0; python_version>='3.12'", @@ -43,7 +43,6 @@ classifiers = [ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', @@ -146,7 +145,7 @@ parentdir_prefix = "pandas-" setup = ['--vsenv'] # For Windows [tool.cibuildwheel] -skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x" +skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} test-requires = 
"hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" @@ -521,7 +520,6 @@ module = [ "pandas._libs.*", "pandas._testing.*", # TODO "pandas.compat.numpy.function", # TODO - "pandas.compat.compressors", # TODO "pandas.core._numba.executor", # TODO "pandas.core.array_algos.masked_reductions", # TODO "pandas.core.array_algos.putmask", # TODO diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py index 52eca6f6d93ac..ec0a4a408c800 100644 --- a/scripts/check_for_inconsistent_pandas_namespace.py +++ b/scripts/check_for_inconsistent_pandas_namespace.py @@ -27,10 +27,7 @@ Sequence, ) import sys -from typing import ( - NamedTuple, - Optional, -) +from typing import NamedTuple ERROR_MESSAGE = ( "{path}:{lineno}:{col_offset}: " @@ -89,7 +86,7 @@ def replace_inconsistent_pandas_namespace(visitor: Visitor, content: str) -> str def check_for_inconsistent_pandas_namespace( content: str, path: str, *, replace: bool -) -> Optional[str]: +) -> str | None: tree = ast.parse(content) visitor = Visitor() @@ -121,7 +118,7 @@ def check_for_inconsistent_pandas_namespace( return replace_inconsistent_pandas_namespace(visitor, content) -def main(argv: Optional[Sequence[str]] = None) -> None: +def main(argv: Sequence[str] | None = None) -> None: parser = argparse.ArgumentParser() parser.add_argument("paths", nargs="*") parser.add_argument("--replace", action="store_true") diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index ed7b9affe9a50..b832b6aa95198 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -39,8 +39,6 @@ classifiers = [ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering' diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index ba3123a07df4b..35f6ffb4980df 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -12,14 +12,14 @@ import argparse import ast -from collections.abc import Iterable +from collections.abc import ( + Callable, + Iterable, +) import sys import token import tokenize -from typing import ( - IO, - Callable, -) +from typing import IO PRIVATE_IMPORTS_TO_IGNORE: set[str] = { "_extension_array_shared_docs", From fe785cc091048a74779e5bfd578bd57a27ed96df Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 24 Jun 2024 22:36:34 +0530 Subject: [PATCH 109/272] DOC: fix SA01 for pandas.Timestamp.floor (#59070) * DOC: fix SA01 for pandas.Timestamp.floor * DOC: fix SA01 for pandas.Timestamp.floor --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/nattype.pyx | 6 ++++++ pandas/_libs/tslibs/timestamps.pyx | 6 ++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 013f7abe5ff0d..424171cee794c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -231,7 +231,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.ctime SA01" \ -i "pandas.Timestamp.date SA01" \ -i "pandas.Timestamp.day GL08" \ - -i "pandas.Timestamp.floor SA01" \ -i "pandas.Timestamp.fold GL08" \ -i "pandas.Timestamp.fromordinal SA01" \ -i "pandas.Timestamp.fromtimestamp PR01,SA01" \ diff --git a/pandas/_libs/tslibs/nattype.pyx 
b/pandas/_libs/tslibs/nattype.pyx index c483814a3ef74..653097026465a 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1139,6 +1139,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.ceil : Round up a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.floor : Round down the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, flooring will take place relative to the diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 9cd0fea1d618e..707e36a848a49 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -2207,6 +2207,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.ceil : Round up a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.floor : Round down the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, flooring will take place relative to the From dfaaa39034eeb0bcf1da4dc7c4534b5b4ed6b1c4 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Mon, 24 Jun 2024 20:13:09 +0300 Subject: [PATCH 110/272] BUG: Fix read_csv raising TypeError when iterator and nrows are specified without chunksize (#59080) BUG: Fix read_csv raising TypeError when iterator and nrows are specified without a chunksize --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/parsers/readers.py | 5 +++- .../tests/io/parser/common/test_iterator.py | 25 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3d869bf31f372..73e939e7f4bb0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -554,6 +554,7 @@ I/O - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) +- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
(:issue:`58638`) Period diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 8a07c99b0fe94..d00fc3b15976c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1534,7 +1534,10 @@ def get_chunk(self, size: int | None = None) -> DataFrame: if self.nrows is not None: if self._currow >= self.nrows: raise StopIteration - size = min(size, self.nrows - self._currow) + if size is None: + size = self.nrows - self._currow + else: + size = min(size, self.nrows - self._currow) return self.read(nrows=size) def __enter__(self) -> Self: diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py index 091edb67f6e19..668aab05b9fa4 100644 --- a/pandas/tests/io/parser/common/test_iterator.py +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -98,6 +98,31 @@ def test_iterator_stop_on_chunksize(all_parsers): tm.assert_frame_equal(concat(result), expected) +def test_nrows_iterator_without_chunksize(all_parsers): + # GH 59079 + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True, nrows=2) + return + + with parser.read_csv(StringIO(data), iterator=True, nrows=2) as reader: + result = reader.get_chunk() + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["foo", "bar"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] ) From bd7ece0016fe883723b02d3012c31f731490a3c4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Jun 2024 07:55:15 -1000 Subject: [PATCH 111/272] BUG: DatetimeIndex.union with non-nano (#59037) * BUG: DatetimeIndex.union with non-nano * Add as_unit --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/datetimelike.py | 6 ++++-- pandas/tests/indexes/datetimes/test_setops.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 73e939e7f4bb0..f7039021ff276 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -495,6 +495,7 @@ Datetimelike - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) +- Bug in :meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7e8d808769bc1..e1120466eaf83 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -523,7 +523,7 @@ def _as_range_index(self) -> RangeIndex: # Convert our i8 representations to RangeIndex # Caller is responsible for checking isinstance(self.freq, Tick) freq = 
cast(Tick, self.freq) - tick = Timedelta(freq).as_unit("ns")._value + tick = Timedelta(freq).as_unit(self.unit)._value rng = range(self[0]._value, self[-1]._value + tick, tick) return RangeIndex(rng) @@ -536,7 +536,9 @@ def _wrap_range_setop(self, other, res_i8) -> Self: # RangeIndex defaults to step=1, which we don't want. new_freq = self.freq elif isinstance(res_i8, RangeIndex): - new_freq = to_offset(Timedelta(res_i8.step)) + new_freq = to_offset( + Timedelta(res_i8.step, unit=self.unit).as_unit(self.unit) + ) # TODO(GH#41493): we cannot just do # type(self._data)(res_i8.values, dtype=self.dtype, freq=new_freq) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index fc3a1d4721841..36011b981179b 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -664,3 +664,19 @@ def test_intersection_dst_transition(self, tz): result = index1.union(index2) expected = date_range("2021-10-28", periods=6, freq="D", tz="Europe/London") tm.assert_index_equal(result, expected) + + +def test_union_non_nano_rangelike(): + # GH 59036 + l1 = DatetimeIndex( + ["2024-05-11", "2024-05-12"], dtype="datetime64[us]", name="Date", freq="D" + ) + l2 = DatetimeIndex(["2024-05-13"], dtype="datetime64[us]", name="Date", freq="D") + result = l1.union(l2) + expected = DatetimeIndex( + ["2024-05-11", "2024-05-12", "2024-05-13"], + dtype="datetime64[us]", + name="Date", + freq="D", + ) + tm.assert_index_equal(result, expected) From 1cf98aa9ecd65764ace8f0e38cf54220f3034052 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Jun 2024 08:04:21 -1000 Subject: [PATCH 112/272] REF: Prefer testing and documenting zoneinfo instead of pytz (#59016) * REF: Prefer testing and documenting zoneinfo instead of pytz * Fix tests * Remove bad test case, fix bad attribute --- asv_bench/benchmarks/tslibs/timestamp.py | 15 +-- asv_bench/benchmarks/tslibs/tslib.py | 4 +- asv_bench/benchmarks/tslibs/tz_convert.py | 5 +- doc/source/user_guide/io.rst | 6 +- doc/source/user_guide/timeseries.rst | 16 +-- pandas/_libs/tslibs/nattype.pyx | 14 +-- pandas/_libs/tslibs/timestamps.pyx | 14 +-- pandas/_libs/tslibs/timezones.pyx | 39 ++++--- pandas/_testing/_hypothesis.py | 3 +- pandas/core/arrays/datetimes.py | 12 +-- pandas/core/indexes/datetimes.py | 2 +- pandas/tests/arithmetic/test_datetime64.py | 8 +- pandas/tests/arrays/test_array.py | 18 ++-- pandas/tests/arrays/test_datetimes.py | 25 ++--- pandas/tests/dtypes/test_dtypes.py | 4 +- pandas/tests/dtypes/test_inference.py | 4 +- .../frame/constructors/test_from_records.py | 8 +- pandas/tests/frame/methods/test_at_time.py | 11 +- pandas/tests/frame/methods/test_join.py | 12 +-- pandas/tests/frame/methods/test_to_dict.py | 14 +-- pandas/tests/frame/methods/test_tz_convert.py | 24 +++-- pandas/tests/frame/test_alter_axes.py | 9 +- pandas/tests/frame/test_constructors.py | 5 +- pandas/tests/groupby/test_timegrouper.py | 12 +-- .../indexes/datetimes/methods/test_astype.py | 3 +- .../indexes/datetimes/methods/test_insert.py | 100 ++++++++++-------- .../indexes/datetimes/methods/test_shift.py | 32 ++++-- .../datetimes/methods/test_to_period.py | 11 +- .../datetimes/methods/test_tz_convert.py | 6 +- .../datetimes/methods/test_tz_localize.py | 47 +++----- .../indexes/datetimes/test_constructors.py | 27 +++-- .../indexes/datetimes/test_date_range.py | 15 +-- .../tests/indexes/datetimes/test_formats.py | 8 +- 
pandas/tests/indexes/datetimes/test_setops.py | 5 +- .../tests/indexes/datetimes/test_timezones.py | 26 +++-- pandas/tests/indexes/multi/test_reshape.py | 16 +-- pandas/tests/io/json/test_ujson.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 2 +- pandas/tests/io/test_feather.py | 7 +- pandas/tests/io/test_parquet.py | 4 +- pandas/tests/io/test_pickle.py | 1 + pandas/tests/resample/test_datetime_index.py | 8 +- pandas/tests/resample/test_period_index.py | 21 ++-- .../reshape/concat/test_append_common.py | 9 +- pandas/tests/reshape/merge/test_merge_asof.py | 7 +- pandas/tests/scalar/test_nat.py | 4 +- .../scalar/timestamp/methods/test_replace.py | 11 +- .../methods/test_timestamp_method.py | 4 +- .../timestamp/methods/test_to_pydatetime.py | 3 +- .../timestamp/methods/test_tz_localize.py | 57 +++++----- .../tests/scalar/timestamp/test_arithmetic.py | 7 +- .../scalar/timestamp/test_constructors.py | 43 ++++---- pandas/tests/scalar/timestamp/test_formats.py | 14 +-- .../tests/scalar/timestamp/test_timestamp.py | 15 ++- pandas/tests/series/indexing/test_datetime.py | 2 +- pandas/tests/series/methods/test_fillna.py | 11 +- pandas/tests/tools/test_to_datetime.py | 21 ++-- pandas/tests/tseries/holiday/test_holiday.py | 16 +-- pandas/tests/tseries/offsets/test_dst.py | 31 +++--- pandas/tests/tslibs/test_conversion.py | 22 ++-- pandas/tests/tslibs/test_resolution.py | 5 +- pandas/tests/tslibs/test_timezones.py | 27 ++--- 62 files changed, 504 insertions(+), 430 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index 082220ee0dff2..6145966fb6a0e 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -1,7 +1,10 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) +import zoneinfo import numpy as np -import pytz from pandas import Timestamp @@ -12,7 +15,7 @@ class TimestampConstruction: def setup(self): self.npdatetime64 = np.datetime64("2020-01-01 00:00:00") self.dttime_unaware = datetime(2020, 1, 1, 0, 0, 0) - self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC) + self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, timezone.utc) self.ts = Timestamp("2020-01-01 00:00:00") def time_parse_iso8601_no_tz(self): @@ -113,7 +116,7 @@ def setup(self, tz): self.ts = Timestamp("2017-08-25 08:16:14", tz=tz) def time_replace_tz(self, tz): - self.ts.replace(tzinfo=pytz.timezone("US/Eastern")) + self.ts.replace(tzinfo=zoneinfo.ZoneInfo("US/Eastern")) def time_replace_None(self, tz): self.ts.replace(tzinfo=None) @@ -144,8 +147,8 @@ def time_ceil(self, tz): class TimestampAcrossDst: def setup(self): - dt = datetime(2016, 3, 27, 1) - self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + dt = datetime(2016, 3, 27, 1, fold=0) + self.tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo self.ts2 = Timestamp(dt) def time_replace_across_dst(self): diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index 4a011d4bb3f06..885cf48d01743 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -20,13 +20,13 @@ timedelta, timezone, ) +import zoneinfo from dateutil.tz import ( gettz, tzlocal, ) import numpy as np -import pytz try: from pandas._libs.tslibs import ints_to_pydatetime @@ -38,7 +38,7 @@ None, timezone.utc, timezone(timedelta(minutes=60)), - pytz.timezone("US/Pacific"), + zoneinfo.ZoneInfo("US/Pacific"), gettz("Asia/Tokyo"), tzlocal_obj, ] diff --git 
a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py index c6b510efdca69..c87adb5e5d0e9 100644 --- a/asv_bench/benchmarks/tslibs/tz_convert.py +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -1,5 +1,6 @@ +from datetime import timezone + import numpy as np -from pytz import UTC from pandas._libs.tslibs.tzconversion import tz_localize_to_utc @@ -41,7 +42,7 @@ def time_tz_convert_from_utc(self, size, tz): # dti = DatetimeIndex(self.i8data, tz=tz) # dti.tz_localize(None) if old_sig: - tz_convert_from_utc(self.i8data, UTC, tz) + tz_convert_from_utc(self.i8data, timezone.utc, tz) else: tz_convert_from_utc(self.i8data, tz) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c523f3a641d91..64b151c167ef3 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4990,7 +4990,7 @@ Caveats convenience you can use ``store.flush(fsync=True)`` to do this for you. * Once a ``table`` is created columns (DataFrame) are fixed; only exactly the same columns can be appended -* Be aware that timezones (e.g., ``pytz.timezone('US/Eastern')``) +* Be aware that timezones (e.g., ``zoneinfo.ZoneInfo('US/Eastern')``) are not necessarily equal across timezone versions. So if data is localized to a specific timezone in the HDFStore using one version of a timezone library and that data is updated with another version, the data @@ -5169,6 +5169,8 @@ See the `Full Documentation `__. .. ipython:: python + import pytz + df = pd.DataFrame( { "a": list("abc"), @@ -5178,7 +5180,7 @@ See the `Full Documentation `__. "e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), - "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.date_range("20130101", periods=3, tz=pytz.timezone("US/Eastern")), "i": pd.date_range("20130101", periods=3, freq="ns"), } ) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 0fa36f1e30104..0845417e4910d 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2337,7 +2337,7 @@ Time zone handling ------------------ pandas provides rich support for working with timestamps in different time -zones using the ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone` +zones using the ``zoneinfo``, ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone` objects from the standard library. @@ -2354,14 +2354,14 @@ By default, pandas objects are time zone unaware: To localize these dates to a time zone (assign a particular time zone to a naive date), you can use the ``tz_localize`` method or the ``tz`` keyword argument in :func:`date_range`, :class:`Timestamp`, or :class:`DatetimeIndex`. -You can either pass ``pytz`` or ``dateutil`` time zone objects or Olson time zone database strings. +You can either pass ``zoneinfo``, ``pytz`` or ``dateutil`` time zone objects or Olson time zone database strings. Olson time zone strings will return ``pytz`` time zone objects by default. To return ``dateutil`` time zone objects, append ``dateutil/`` before the string. -* In ``pytz`` you can find a list of common (and less common) time zones using - ``from pytz import common_timezones, all_timezones``. +* For ``zoneinfo``, a list of available timezones are available from :py:func:`zoneinfo.available_timezones`. +* In ``pytz`` you can find a list of common (and less common) time zones using ``pytz.all_timezones``. 
* ``dateutil`` uses the OS time zones so there isn't a fixed list available. For - common zones, the names are the same as ``pytz``. + common zones, the names are the same as ``pytz`` and ``zoneinfo``. .. ipython:: python @@ -2466,7 +2466,7 @@ you can use the ``tz_convert`` method. .. warning:: - If you are using dates beyond 2038-01-18, due to current deficiencies + If you are using dates beyond 2038-01-18 with ``pytz``, due to current deficiencies in the underlying libraries caused by the year 2038 problem, daylight saving time (DST) adjustments to timezone aware dates will not be applied. If and when the underlying libraries are fixed, the DST transitions will be applied. @@ -2475,9 +2475,11 @@ you can use the ``tz_convert`` method. .. ipython:: python + import pytz + d_2037 = "2037-03-31T010101" d_2038 = "2038-03-31T010101" - DST = "Europe/London" + DST = pytz.timezone("Europe/London") assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz="GMT") assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz="GMT") diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 653097026465a..27a371ef43832 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -841,7 +841,7 @@ class NaTType(_NaT): Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -894,7 +894,7 @@ class NaTType(_NaT): ---------- ordinal : int Date corresponding to a proleptic Gregorian ordinal. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. Notes @@ -1307,7 +1307,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -1361,7 +1361,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding local time. @@ -1461,13 +1461,13 @@ default 'raise' Replace timezone (not a conversion): - >>> import pytz - >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> import zoneinfo + >>> ts.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') Analogous for ``pd.NaT``: - >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> pd.NaT.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) NaT """, ) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 707e36a848a49..93715c907d182 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1374,7 +1374,7 @@ class Timestamp(_Timestamp): Timezone info. nanosecond : int, optional, default 0 Value of nanosecond. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will have. unit : str Unit used for conversion if ts_input is of type int or float. 
The @@ -1446,7 +1446,7 @@ class Timestamp(_Timestamp): ---------- ordinal : int Date corresponding to a proleptic Gregorian ordinal. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. Notes @@ -2393,7 +2393,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding local time. @@ -2500,7 +2500,7 @@ default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -2604,13 +2604,13 @@ default 'raise' Replace timezone (not a conversion): - >>> import pytz - >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> import zoneinfo + >>> ts.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') Analogous for ``pd.NaT``: - >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> pd.NaT.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) NaT """ diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 10e5790dd1c35..6292b6ce0fd1d 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -119,27 +119,26 @@ cpdef inline object get_timezone(tzinfo tz): raise TypeError("tz argument cannot be None") if is_utc(tz): return tz + elif is_zoneinfo(tz): + return tz.key + elif treat_tz_as_pytz(tz): + zone = tz.zone + if zone is None: + return tz + return zone + elif treat_tz_as_dateutil(tz): + if ".tar.gz" in tz._filename: + raise ValueError( + "Bad tz filename. Dateutil on python 3 on windows has a " + "bug which causes tzfile._filename to be the same for all " + "timezone files. Please construct dateutil timezones " + 'implicitly by passing a string like "dateutil/Europe' + '/London" when you construct your pandas objects instead ' + "of passing a timezone object. See " + "https://github.com/pandas-dev/pandas/pull/7362") + return "dateutil/" + tz._filename else: - if treat_tz_as_dateutil(tz): - if ".tar.gz" in tz._filename: - raise ValueError( - "Bad tz filename. Dateutil on python 3 on windows has a " - "bug which causes tzfile._filename to be the same for all " - "timezone files. Please construct dateutil timezones " - 'implicitly by passing a string like "dateutil/Europe' - '/London" when you construct your pandas objects instead ' - "of passing a timezone object. See " - "https://github.com/pandas-dev/pandas/pull/7362") - return "dateutil/" + tz._filename - else: - # tz is a pytz timezone or unknown. 
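# Editorial aside: the get_timezone() rewrite above flattens the old nested
# pytz/dateutil fallback (removed here) into one if/elif chain that checks
# zoneinfo first. A rough pure-Python sketch of the new dispatch, using a
# hypothetical helper name; the real function is Cython and also
# special-cases UTC and rejects bad dateutil ".tar.gz" filenames:
from datetime import tzinfo

def get_timezone_sketch(tz: tzinfo):
    if hasattr(tz, "key"):                     # zoneinfo zones name themselves via .key
        return tz.key
    if getattr(tz, "zone", None) is not None:  # pytz zones use .zone (may be None)
        return tz.zone
    if hasattr(tz, "_filename"):               # dateutil tzfile: prefix the file path
        return "dateutil/" + tz._filename
    return tz                                  # unknown tzinfo objects pass through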
-            try:
-                zone = tz.zone
-                if zone is None:
-                    return tz
-                return zone
-            except AttributeError:
-                return tz
+        return tz


 cpdef inline tzinfo maybe_get_tz(object tz):
diff --git a/pandas/_testing/_hypothesis.py b/pandas/_testing/_hypothesis.py
index b7fc175b10d17..bbad21d8ab8d1 100644
--- a/pandas/_testing/_hypothesis.py
+++ b/pandas/_testing/_hypothesis.py
@@ -6,7 +6,6 @@
 from hypothesis import strategies as st
 from hypothesis.extra.dateutil import timezones as dateutil_timezones
-from hypothesis.extra.pytz import timezones as pytz_timezones

 from pandas.compat import is_platform_windows

@@ -57,7 +56,7 @@
 DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes(
     min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),  # pyright: ignore[reportArgumentType]
     max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),  # pyright: ignore[reportArgumentType]
-    timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()),
+    timezones=st.one_of(st.none(), dateutil_timezones(), st.timezones()),
 )

 DATETIME_IN_PD_TIMESTAMP_RANGE_NO_TZ = st.datetimes(
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index e0a4587535cfd..34d25f04b69e1 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -594,7 +594,7 @@ def tz(self) -> tzinfo | None:

         Returns
         -------
-        datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None
+        zoneinfo.ZoneInfo, datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None
             Returns None when the array is tz-naive.

         See Also
@@ -624,7 +624,7 @@ def tz(self) -> tzinfo | None:
         ...     )
         >>> idx.tz
         datetime.timezone.utc
-        """
+        """  # noqa: E501
         # GH 18595
         return getattr(self.dtype, "tz", None)

@@ -863,7 +863,7 @@ def tz_convert(self, tz) -> Self:

         Parameters
         ----------
-        tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None
+        tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None
             Time zone for time. Corresponding timestamps would be converted
             to this time zone of the Datetime Array/Index. A `tz` of None will
             convert to UTC and remove the timezone information.
@@ -923,7 +923,7 @@ def tz_convert(self, tz) -> Self:
                        '2014-08-01 08:00:00',
                        '2014-08-01 09:00:00'],
                       dtype='datetime64[ns]', freq='h')
-        """
+        """  # noqa: E501
         tz = timezones.maybe_get_tz(tz)

         if self.tz is None:
@@ -955,7 +955,7 @@ def tz_localize(

         Parameters
         ----------
-        tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None
+        tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None
             Time zone to convert timestamps to. Passing ``None`` will
             remove the time zone information preserving local time.
         ambiguous : 'infer', 'NaT', bool array, default 'raise'
@@ -1081,7 +1081,7 @@ def tz_localize(
         0   2015-03-29 03:30:00+02:00
         1   2015-03-29 03:30:00+02:00
         dtype: datetime64[ns, Europe/Warsaw]
-        """
+        """  # noqa: E501
         nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
         if nonexistent not in nonexistent_options and not isinstance(
             nonexistent, timedelta
         )
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index c276750314a34..00a929724ed4c 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -147,7 +147,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin):
         One of pandas date offset strings or corresponding objects. The string
         'infer' can be passed in order to set the frequency of the index as the
         inferred frequency upon creation.
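The docstrings above now list ``zoneinfo.ZoneInfo`` first among the accepted
``tz`` types. A small usage sketch of the two operations being documented,
localization versus conversion, using only standard pandas API::

    import zoneinfo
    import pandas as pd

    tz = zoneinfo.ZoneInfo("US/Eastern")
    naive = pd.date_range("2014-08-01 09:00", periods=3, freq="h")

    aware = naive.tz_localize(tz)      # attach a zone to naive timestamps
    in_utc = aware.tz_convert("UTC")   # convert between zones
    local = aware.tz_localize(None)    # None drops the zone, keeping wall time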
- tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str + tz : zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or str Set the Timezone of the data. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index f9807310460b4..cfc93ecae295d 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -5,6 +5,7 @@ datetime, time, timedelta, + timezone, ) from itertools import ( product, @@ -14,7 +15,6 @@ import numpy as np import pytest -import pytz from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.offsets import shift_months @@ -1870,8 +1870,10 @@ def test_dt64tz_series_sub_dtitz(self): def test_sub_datetime_compat(self, unit): # see GH#14088 - ser = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), NaT]).dt.as_unit(unit) - dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) + ser = Series([datetime(2016, 8, 23, 12, tzinfo=timezone.utc), NaT]).dt.as_unit( + unit + ) + dt = datetime(2016, 8, 22, 12, tzinfo=timezone.utc) # The datetime object has "us" so we upcast lower units exp_unit = tm.get_finest_unit(unit, "us") exp = Series([Timedelta("1 days"), NaT]).dt.as_unit(exp_unit) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 97d57163ed079..f7b76e7388ae9 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -1,9 +1,9 @@ import datetime import decimal +import zoneinfo import numpy as np import pytest -import pytz import pandas as pd import pandas._testing as tm @@ -285,9 +285,6 @@ def test_array_copy(): assert tm.shares_memory(a, b) -cet = pytz.timezone("CET") - - @pytest.mark.parametrize( "data, expected", [ @@ -326,11 +323,18 @@ def test_array_copy(): ), ( [ - datetime.datetime(2000, 1, 1, tzinfo=cet), - datetime.datetime(2001, 1, 1, tzinfo=cet), + datetime.datetime( + 2000, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin") + ), + datetime.datetime( + 2001, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin") + ), ], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="us") + ["2000", "2001"], + dtype=pd.DatetimeTZDtype( + tz=zoneinfo.ZoneInfo("Europe/Berlin"), unit="us" + ), ), ), # timedelta diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 63d60c78da482..0a00264a7156f 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -7,12 +7,6 @@ from datetime import timedelta import operator -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - import numpy as np import pytest @@ -724,21 +718,14 @@ def test_tz_localize_t2d(self): roundtrip = expected.tz_localize("US/Pacific") tm.assert_datetime_array_equal(roundtrip, dta) - easts = ["US/Eastern", "dateutil/US/Eastern"] - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Eastern") - except KeyError: - # no tzdata - pass - else: - # Argument 1 to "append" of "list" has incompatible type "ZoneInfo"; - # expected "str" - easts.append(tz) # type: ignore[arg-type] - - @pytest.mark.parametrize("tz", easts) + @pytest.mark.parametrize( + "tz", ["US/Eastern", "dateutil/US/Eastern", "pytz/US/Eastern"] + ) def test_iter_zoneinfo_fold(self, tz): # GH#49684 + if tz.startswith("pytz/"): + pytz = 
pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) utc_vals = np.array( [1320552000, 1320555600, 1320559200, 1320562800], dtype=np.int64 ) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index c6da01636247d..252fc484a8246 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -3,7 +3,6 @@ import numpy as np import pytest -import pytz from pandas._libs.tslibs.dtypes import NpyDatetimeUnit @@ -391,8 +390,9 @@ def test_empty(self): def test_tz_standardize(self): # GH 24713 + pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") - dr = date_range("2013-01-01", periods=3, tz="US/Eastern") + dr = date_range("2013-01-01", periods=3, tz=tz) dtype = DatetimeTZDtype("ns", dr.tz) assert dtype.tz == tz dtype = DatetimeTZDtype("ns", dr[0].tz) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index db18cd4aef14e..b1d7c701e1267 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -12,6 +12,7 @@ datetime, time, timedelta, + timezone, ) from decimal import Decimal from fractions import Fraction @@ -27,7 +28,6 @@ import numpy as np import pytest -import pytz from pandas._libs import ( lib, @@ -1022,7 +1022,7 @@ def test_maybe_convert_objects_itemsize(self, data0, data1): def test_mixed_dtypes_remain_object_array(self): # GH14956 - arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) + arr = np.array([datetime(2015, 1, 1, tzinfo=timezone.utc), 1], dtype=object) result = lib.maybe_convert_objects(arr, convert_non_numeric=True) tm.assert_numpy_array_equal(result, arr) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 35e143fcedf7b..5be42d41af03a 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -1,10 +1,12 @@ from collections.abc import Iterator -from datetime import datetime +from datetime import ( + datetime, + timezone, +) from decimal import Decimal import numpy as np import pytest -import pytz from pandas._config import using_pyarrow_string_dtype @@ -239,7 +241,7 @@ def test_from_records_series_categorical_index(self): tm.assert_frame_equal(frame, expected) def test_frame_from_records_utc(self): - rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} + rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=timezone.utc)} # it works DataFrame.from_records([rec], index="begin_time") diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index 126899826fac3..b69db80dee446 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -1,8 +1,11 @@ -from datetime import time +from datetime import ( + time, + timezone, +) +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs.tslibs import timezones @@ -65,7 +68,7 @@ def test_at_time_nonexistent(self, frame_or_series): assert len(rs) == 0 @pytest.mark.parametrize( - "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)] + "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=timezone.utc)] ) def test_at_time_errors(self, hour): # GH#24043 @@ -83,7 +86,7 @@ def test_at_time_tz(self): # GH#24043 dti = date_range("2018", periods=3, freq="h", tz="US/Pacific") df = DataFrame(list(range(len(dti))), index=dti) - result = df.at_time(time(4, 
tzinfo=pytz.timezone("US/Eastern"))) + result = df.at_time(time(4, tzinfo=zoneinfo.ZoneInfo("US/Eastern"))) expected = df.iloc[1:2] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 82802dd6e99eb..7de87e633cfb1 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -1,4 +1,5 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest @@ -543,17 +544,14 @@ def test_merge_join_different_levels_raises(self): df1.join(df2, on="a") def test_frame_join_tzaware(self): + tz = zoneinfo.ZoneInfo("US/Central") test1 = DataFrame( np.zeros((6, 3)), - index=date_range( - "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central" - ), + index=date_range("2012-11-15 00:00:00", periods=6, freq="100ms", tz=tz), ) test2 = DataFrame( np.zeros((3, 3)), - index=date_range( - "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central" - ), + index=date_range("2012-11-15 00:00:00", periods=3, freq="250ms", tz=tz), columns=range(3, 6), ) @@ -561,4 +559,4 @@ def test_frame_join_tzaware(self): expected = test1.index.union(test2.index) tm.assert_index_equal(result.index, expected) - assert result.index.tz.zone == "US/Central" + assert result.index.tz.key == "US/Central" diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 0272b679e85a2..c43d947b4877e 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -2,11 +2,13 @@ OrderedDict, defaultdict, ) -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest -import pytz from pandas import ( NA, @@ -209,15 +211,15 @@ def test_to_dict_tz(self): # GH#18372 When converting to dict with orient='records' columns of # datetime that are tz-aware were not converted to required arrays data = [ - (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), - (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=timezone.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=timezone.utc),), ] df = DataFrame(list(data), columns=["d"]) result = df.to_dict(orient="records") expected = [ - {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, - {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, + {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=timezone.utc)}, + {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=timezone.utc)}, ] tm.assert_dict_equal(result[0], expected[0]) tm.assert_dict_equal(result[1], expected[1]) diff --git a/pandas/tests/frame/methods/test_tz_convert.py b/pandas/tests/frame/methods/test_tz_convert.py index e9209f218bca9..5ee4021102f22 100644 --- a/pandas/tests/frame/methods/test_tz_convert.py +++ b/pandas/tests/frame/methods/test_tz_convert.py @@ -1,3 +1,5 @@ +import zoneinfo + import numpy as np import pytest @@ -13,28 +15,34 @@ class TestTZConvert: def test_tz_convert(self, frame_or_series): - rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") + rng = date_range( + "1/1/2011", periods=200, freq="D", tz=zoneinfo.ZoneInfo("US/Eastern") + ) obj = DataFrame({"a": 1}, index=rng) obj = tm.get_obj(obj, frame_or_series) - result = obj.tz_convert("Europe/Berlin") - expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + berlin = zoneinfo.ZoneInfo("Europe/Berlin") + result = obj.tz_convert(berlin) + expected = 
DataFrame({"a": 1}, rng.tz_convert(berlin)) expected = tm.get_obj(expected, frame_or_series) - assert result.index.tz.zone == "Europe/Berlin" + assert result.index.tz.key == "Europe/Berlin" tm.assert_equal(result, expected) def test_tz_convert_axis1(self): - rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") + rng = date_range( + "1/1/2011", periods=200, freq="D", tz=zoneinfo.ZoneInfo("US/Eastern") + ) obj = DataFrame({"a": 1}, index=rng) obj = obj.T - result = obj.tz_convert("Europe/Berlin", axis=1) - assert result.columns.tz.zone == "Europe/Berlin" + berlin = zoneinfo.ZoneInfo("Europe/Berlin") + result = obj.tz_convert(berlin, axis=1) + assert result.columns.tz.key == "Europe/Berlin" - expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + expected = DataFrame({"a": 1}, rng.tz_convert(berlin)) tm.assert_equal(result, expected.T) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index c68171ab254c7..b4c16b94fcf8b 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,6 +1,7 @@ -from datetime import datetime - -import pytz +from datetime import ( + datetime, + timezone, +) from pandas import DataFrame import pandas._testing as tm @@ -13,7 +14,7 @@ def test_set_axis_setattr_index(self): # GH 6785 # set the index manually - df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1}]) + df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=timezone.utc), "foo": 1}]) expected = df.set_index("ts") df.index = df["ts"] df.pop("ts") diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index da0504458cf5d..c0b9e6549c4ba 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -14,12 +14,12 @@ ) import functools import re +import zoneinfo import numpy as np from numpy import ma from numpy.ma import mrecords import pytest -import pytz from pandas._config import using_pyarrow_string_dtype @@ -1908,8 +1908,7 @@ def test_constructor_with_datetimes2(self): def test_constructor_with_datetimes3(self): # GH 7594 # don't coerce tz-aware - tz = pytz.timezone("US/Eastern") - dt = tz.localize(datetime(2012, 1, 1)) + dt = datetime(2012, 1, 1, tzinfo=zoneinfo.ZoneInfo("US/Eastern")) df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ea556d043be2d..44e8e050cb756 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -5,11 +5,11 @@ from datetime import ( datetime, timedelta, + timezone, ) import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -774,12 +774,12 @@ def test_groupby_with_timezone_selection(self): def test_timezone_info(self): # see gh-11682: Timezone info lost when broadcasting # scalar datetime to DataFrame - - df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) - assert df["b"][0].tzinfo == pytz.utc + utc = timezone.utc + df = DataFrame({"a": [1], "b": [datetime.now(utc)]}) + assert df["b"][0].tzinfo == utc df = DataFrame({"a": [1, 2, 3]}) - df["b"] = datetime.now(pytz.utc) - assert df["b"][0].tzinfo == pytz.utc + df["b"] = datetime.now(utc) + assert df["b"][0].tzinfo == utc def test_datetime_count(self): df = DataFrame( diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index c0bc6601769b1..81dc3b3ecc45e 100644 
--- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -3,7 +3,6 @@ import dateutil import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -251,6 +250,8 @@ def _check_rng(rng): _check_rng(rng_utc) def test_index_convert_to_datetime_array_explicit_pytz(self): + pytz = pytest.importorskip("pytz") + def _check_rng(rng): converted = rng.to_pydatetime() assert isinstance(converted, np.ndarray) diff --git a/pandas/tests/indexes/datetimes/methods/test_insert.py b/pandas/tests/indexes/datetimes/methods/test_insert.py index ebfe490e0e067..4a5b7bcc1a86f 100644 --- a/pandas/tests/indexes/datetimes/methods/test_insert.py +++ b/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -1,8 +1,8 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest -import pytz from pandas import ( NA, @@ -133,49 +133,59 @@ def test_insert3(self, unit): assert result.name == expected.name assert result.freq is None - def test_insert4(self, unit): - for tz in ["US/Pacific", "Asia/Singapore"]: - idx = date_range( - "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit - ) - # preserve freq - expected = date_range( - "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit - ) - for d in [ - Timestamp("2000-01-01 15:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - expected = DatetimeIndex( - [ - "2000-01-01 09:00", - "2000-01-01 10:00", - "2000-01-01 11:00", - "2000-01-01 12:00", - "2000-01-01 13:00", - "2000-01-01 14:00", - "2000-01-01 10:00", - ], - name="idx", - tz=tz, - freq=None, - ).as_unit(unit) - # reset freq to None - for d in [ - Timestamp("2000-01-01 10:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.tz == expected.tz - assert result.freq is None + @pytest.mark.parametrize("tz", ["US/Pacific", "Asia/Singapore"]) + @pytest.mark.parametrize( + "to_ts", + [lambda x: x, lambda x: x.to_pydatetime()], + ids=["Timestamp", "datetime"], + ) + def test_insert4(self, unit, tz, to_ts): + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + # preserve freq + expected = date_range( + "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit + ) + tz = zoneinfo.ZoneInfo(tz) + d = to_ts(Timestamp("2000-01-01 15:00", tz=tz)) + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + @pytest.mark.parametrize("tz", ["US/Pacific", "Asia/Singapore"]) + @pytest.mark.parametrize( + "to_ts", + [lambda x: x, lambda x: x.to_pydatetime()], + ids=["Timestamp", "datetime"], + ) + def test_insert4_no_freq(self, unit, tz, to_ts): + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ).as_unit(unit) + # reset freq to None + d = to_ts(Timestamp("2000-01-01 10:00", tz=tz)) + result = idx.insert(6, d) + 
tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.tz == expected.tz + assert result.freq is None # TODO: also changes DataFrame.__setitem__ with expansion def test_insert_mismatched_tzawareness(self): @@ -214,7 +224,7 @@ def test_insert_mismatched_tz(self): assert expected.dtype == idx.dtype tm.assert_index_equal(result, expected) - item = datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern")) + item = datetime(2000, 1, 4, tzinfo=zoneinfo.ZoneInfo("US/Eastern")) result = idx.insert(3, item) expected = Index( list(idx[:3]) + [item.astimezone(idx.tzinfo)] + list(idx[3:]), diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py index 375dea01974bb..a202627550cd2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -1,7 +1,7 @@ from datetime import datetime +import zoneinfo import pytest -import pytz from pandas.errors import NullFrequencyError @@ -13,8 +13,6 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexShift: # ------------------------------------------------------------- @@ -122,24 +120,28 @@ def test_dti_shift_across_dst(self, unit): ) def test_dti_shift_near_midnight(self, shift, result_time, unit): # GH 8616 - dt = datetime(2014, 11, 14, 0) - dt_est = pytz.timezone("EST").localize(dt) + tz = zoneinfo.ZoneInfo("US/Eastern") + dt_est = datetime(2014, 11, 14, 0, tzinfo=tz) idx = DatetimeIndex([dt_est]).as_unit(unit) ser = Series(data=[1], index=idx) result = ser.shift(shift, freq="h") - exp_index = DatetimeIndex([result_time], tz="EST").as_unit(unit) + exp_index = DatetimeIndex([result_time], tz=tz).as_unit(unit) expected = Series(1, index=exp_index) tm.assert_series_equal(result, expected) def test_shift_periods(self, unit): # GH#22458 : argument 'n' was deprecated in favor of 'periods' - idx = date_range(start=START, end=END, periods=3, unit=unit) + idx = date_range( + start=datetime(2009, 1, 1), end=datetime(2010, 1, 1), periods=3, unit=unit + ) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) @pytest.mark.parametrize("freq", ["B", "C"]) def test_shift_bday(self, freq, unit): - rng = date_range(START, END, freq=freq, unit=unit) + rng = date_range( + datetime(2009, 1, 1), datetime(2010, 1, 1), freq=freq, unit=unit + ) shifted = rng.shift(5) assert shifted[0] == rng[5] assert shifted.freq == rng.freq @@ -153,11 +155,21 @@ def test_shift_bday(self, freq, unit): assert shifted.freq == rng.freq def test_shift_bmonth(self, performance_warning, unit): - rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + rng = date_range( + datetime(2009, 1, 1), + datetime(2010, 1, 1), + freq=pd.offsets.BMonthEnd(), + unit=unit, + ) shifted = rng.shift(1, freq=pd.offsets.BDay()) assert shifted[0] == rng[0] + pd.offsets.BDay() - rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + rng = date_range( + datetime(2009, 1, 1), + datetime(2010, 1, 1), + freq=pd.offsets.BMonthEnd(), + unit=unit, + ) with tm.assert_produces_warning(performance_warning): shifted = rng.shift(1, freq=pd.offsets.CDay()) assert shifted[0] == rng[0] + pd.offsets.CDay() diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 8e279162b7012..cd4a142dd5b30 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ 
b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -1,7 +1,8 @@ +from datetime import timezone + import dateutil.tz from dateutil.tz import tzlocal import pytest -import pytz from pandas._libs.tslibs.ccalendar import MONTHS from pandas._libs.tslibs.offsets import MonthEnd @@ -155,7 +156,13 @@ def test_to_period_microsecond(self): @pytest.mark.parametrize( "tz", - ["US/Eastern", pytz.utc, tzlocal(), "dateutil/US/Eastern", dateutil.tz.tzutc()], + [ + "US/Eastern", + timezone.utc, + tzlocal(), + "dateutil/US/Eastern", + dateutil.tz.tzutc(), + ], ) def test_to_period_tz(self, tz): ts = date_range("1/1/2000", "2/1/2000", tz=tz) diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_convert.py b/pandas/tests/indexes/datetimes/methods/test_tz_convert.py index b2cf488ac8313..9eabb742b93a4 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_convert.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_convert.py @@ -4,7 +4,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import timezones @@ -260,11 +259,14 @@ def test_dti_tz_convert_tzlocal(self): [ "US/Eastern", "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), ], ) def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tz) diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py index ad7769c6b9671..c6697fd169e8a 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py @@ -1,7 +1,9 @@ from datetime import ( datetime, timedelta, + timezone, ) +from zoneinfo import ZoneInfo import dateutil.tz from dateutil.tz import gettz @@ -19,22 +21,13 @@ ) import pandas._testing as tm -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type [misc] - ZoneInfo = None # type: ignore[misc, assignment] - -easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] -if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Eastern") - except KeyError: - # no tzdata - pass - else: - easts.append(tz) +@pytest.fixture(params=["pytz/US/Eastern", gettz("US/Eastern"), ZoneInfo("US/Eastern")]) +def tz(request): + if isinstance(request.param, str) and request.param.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + return pytz.timezone(request.param.removeprefix("pytz/")) + return request.param class TestTZLocalize: @@ -88,7 +81,6 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): expected = dti.tz_convert("US/Eastern") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition @@ -96,7 +88,6 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dr.tz_localize(tz) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): # With repeated hours, we can infer the transition dr = date_range( @@ -116,7 +107,6 @@ def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): result2 = DatetimeIndex(times, tz=tz, ambiguous="infer").as_unit(unit) 
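# Editorial aside: the "pytz/US/Eastern" strings introduced above are the
# patch's recurring idiom. With pytz now optional, parametrized tests encode
# pytz zones as prefixed strings and resolve them lazily. A standalone sketch
# of the fixture pattern (fixture name hypothetical, pytest only):
import pytest

@pytest.fixture(params=["pytz/US/Eastern", "dateutil/US/Eastern"])
def example_tz(request):
    param = request.param
    if isinstance(param, str) and param.startswith("pytz/"):
        pytz = pytest.importorskip("pytz")  # skip, rather than fail, without pytz
        return pytz.timezone(param.removeprefix("pytz/"))
    return param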
tm.assert_index_equal(result2, expected) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer3(self, tz): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) @@ -124,7 +114,6 @@ def test_dti_tz_localize_ambiguous_infer3(self, tz): localized_infer = dr.tz_localize(tz, ambiguous="infer") tm.assert_index_equal(localized, localized_infer) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_times(self, tz): # March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour()) @@ -143,7 +132,7 @@ def test_dti_tz_localize_ambiguous_times(self, tz): # UTC is OK dr = date_range( - datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=pytz.utc + datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=timezone.utc ) @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) @@ -181,15 +170,6 @@ def test_dti_tz_localize(self, prefix): with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): dti.tz_localize(tzstr) - @pytest.mark.parametrize( - "tz", - [ - "US/Eastern", - "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - ], - ) def test_dti_tz_localize_utc_conversion(self, tz): # Localizing to time zone should: # 1) check for DST ambiguities @@ -245,7 +225,6 @@ def test_dti_tz_localize_tzlocal(self): dti2 = dti.tz_localize(None) tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_nat(self, tz): times = [ "11/06/2011 00:00", @@ -270,7 +249,6 @@ def test_dti_tz_localize_ambiguous_nat(self, tz): # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] tm.assert_numpy_array_equal(di_test.values, localized.values) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_flags(self, tz, unit): # November 6, 2011, fall back, repeat 2 AM hour @@ -321,8 +299,7 @@ def test_dti_tz_localize_ambiguous_flags(self, tz, unit): dr = dr.append(dr) tm.assert_index_equal(dr, localized) - @pytest.mark.parametrize("tz", easts) - def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): + def test_dti_tz_localize_ambiguous_flags2(self, tz): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) is_dst = np.array([1] * 10) @@ -332,8 +309,8 @@ def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): def test_dti_tz_localize_bdate_range(self): dr = bdate_range("1/1/2009", "1/1/2010") - dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) - localized = dr.tz_localize(pytz.utc) + dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=timezone.utc) + localized = dr.tz_localize(timezone.utc) tm.assert_index_equal(dr_utc, localized) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 43a7cdf63d9b9..aba440ceeb56b 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -7,6 +7,7 @@ ) from functools import partial from operator import attrgetter +import zoneinfo import dateutil import dateutil.tz @@ -152,7 +153,9 @@ def test_construction_caching(self): df = pd.DataFrame( { "dt": date_range("20130101", periods=3), - "dttz": date_range("20130101", periods=3, tz="US/Eastern"), + "dttz": date_range( + "20130101", 
periods=3, tz=zoneinfo.ZoneInfo("US/Eastern") + ), "dt_with_null": [ Timestamp("20130101"), pd.NaT, @@ -161,7 +164,7 @@ def test_construction_caching(self): "dtns": date_range("20130101", periods=3, freq="ns"), } ) - assert df.dttz.dtype.tz.zone == "US/Eastern" + assert df.dttz.dtype.tz.key == "US/Eastern" @pytest.mark.parametrize( "kwargs", @@ -198,7 +201,11 @@ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): # incompat tz/dtype msg = "cannot supply both a tz and a dtype with a tz" with pytest.raises(ValueError, match=msg): - DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz="US/Pacific") + DatetimeIndex( + i.tz_localize(None).asi8, + dtype=i.dtype, + tz=zoneinfo.ZoneInfo("US/Hawaii"), + ) def test_construction_index_with_mixed_timezones(self): # gh-11488: no tz results in DatetimeIndex @@ -736,7 +743,7 @@ def test_disallow_setting_tz(self): dti = DatetimeIndex(["2010"], tz="UTC") msg = "Cannot directly set timezone" with pytest.raises(AttributeError, match=msg): - dti.tz = pytz.timezone("US/Pacific") + dti.tz = zoneinfo.ZoneInfo("US/Pacific") @pytest.mark.parametrize( "tz", @@ -764,7 +771,9 @@ def test_constructor_start_end_with_tz(self, tz): @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) def test_constructor_with_non_normalized_pytz(self, tz): # GH 18595 - non_norm_tz = Timestamp("2010", tz=tz).tz + pytz = pytest.importorskip("pytz") + tz_in = pytz.timezone(tz) + non_norm_tz = Timestamp("2010", tz=tz_in).tz result = DatetimeIndex(["2010"], tz=non_norm_tz) assert pytz.timezone(tz) is result.tz @@ -914,7 +923,9 @@ def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(sel expected = DatetimeIndex([Timestamp("2019", tz="UTC"), pd.NaT]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Eastern"), gettz("US/Eastern")] + ) def test_dti_from_tzaware_datetime(self, tz): d = [datetime(2012, 8, 19, tzinfo=tz)] @@ -963,7 +974,7 @@ def test_dti_convert_datetime_list(self, tzstr): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), ], ) @@ -972,6 +983,8 @@ def test_dti_convert_datetime_list(self, tzstr): def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): # GH#47471 check that we get the same raising behavior in the DTI # constructor and Timestamp constructor + if isinstance(tz, str) and tz.startswith("pytz/"): + tz = pytz.timezone(tz.removeprefix("pytz/")) dtstr = "2013-11-03 01:59:59.999999" item = dtstr if not use_str: diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 8bf51bcd38862..ee1c906efea73 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -12,7 +12,6 @@ import numpy as np import pytest import pytz -from pytz import timezone from pandas._libs.tslibs import timezones from pandas._libs.tslibs.offsets import ( @@ -97,6 +96,7 @@ def test_date_range_timestamp_equiv_dateutil(self): assert ts == stamp def test_date_range_timestamp_equiv_explicit_pytz(self): + pytz = pytest.importorskip("pytz") rng = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) stamp = rng[0] @@ -490,7 +490,8 @@ def test_range_bug(self, unit): def test_range_tz_pytz(self): # see gh-2906 - tz = timezone("US/Eastern") + pytz = pytest.importorskip("pytz") + tz = 
pytz.timezone("US/Eastern") start = tz.localize(datetime(2011, 1, 1)) end = tz.localize(datetime(2011, 1, 3)) @@ -517,14 +518,16 @@ def test_range_tz_pytz(self): ], ) def test_range_tz_dst_straddle_pytz(self, start, end): - start = Timestamp(start, tz="US/Eastern") - end = Timestamp(end, tz="US/Eastern") + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Eastern") + start = Timestamp(start, tz=tz) + end = Timestamp(end, tz=tz) dr = date_range(start, end, freq="D") assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) - dr = date_range(start, end, freq="D", tz="US/Eastern") + dr = date_range(start, end, freq="D", tz=tz) assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) @@ -533,7 +536,7 @@ def test_range_tz_dst_straddle_pytz(self, start, end): start.replace(tzinfo=None), end.replace(tzinfo=None), freq="D", - tz="US/Eastern", + tz=tz, ) assert dr[0] == start assert dr[-1] == end diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 6e4e22942ab07..4551fdf073193 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -1,9 +1,11 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import dateutil.tz import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -276,7 +278,7 @@ def test_dti_summary(self): result = idx._summary() assert result == expected - @pytest.mark.parametrize("tz", [None, pytz.utc, dateutil.tz.tzutc()]) + @pytest.mark.parametrize("tz", [None, timezone.utc, dateutil.tz.tzutc()]) @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_repr_etc_smoke(self, tz, freq): # only really care that it works diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 36011b981179b..f04f1592ea0c1 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -1,11 +1,11 @@ from datetime import ( datetime, + timedelta, timezone, ) import numpy as np import pytest -import pytz import pandas.util._test_decorators as td @@ -560,6 +560,7 @@ def test_intersection_list(self): tm.assert_index_equal(res, idx) def test_month_range_union_tz_pytz(self, sort): + pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") early_start = datetime(2011, 1, 1) @@ -648,7 +649,7 @@ def test_intersection_bug(self): assert result.freq == b.freq @pytest.mark.parametrize( - "tz", [None, "UTC", "Europe/Berlin", pytz.FixedOffset(-60)] + "tz", [None, "UTC", "Europe/Berlin", timezone(timedelta(hours=-1))] ) def test_intersection_dst_transition(self, tz): # GH 46702: Europe/Berlin has DST transition diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 0c8bdbdd2fb22..e4b8a909add0d 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -8,11 +8,11 @@ timezone, tzinfo, ) +import zoneinfo from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( conversion, @@ -184,8 +184,11 @@ def test_dti_tz_nat(self, tzstr): assert isna(idx[1]) assert idx[0].tzinfo is not None - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + @pytest.mark.parametrize("tzstr", ["pytz/US/Eastern", "dateutil/US/Eastern"]) def test_utc_box_timestamp_and_localize(self, tzstr): + if 
tzstr.startswith("pytz/"): + pytest.importorskip("pytz") + tzstr = tzstr.removeprefix("pytz/") tz = timezones.maybe_get_tz(tzstr) rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") @@ -206,15 +209,17 @@ def test_utc_box_timestamp_and_localize(self, tzstr): rng_eastern[0].tzinfo ) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Central"), gettz("US/Central")] + ) def test_with_tz(self, tz): # just want it to work - start = datetime(2011, 3, 12, tzinfo=pytz.utc) + start = datetime(2011, 3, 12, tzinfo=timezone.utc) dr = bdate_range(start, periods=50, freq=pd.offsets.Hour()) - assert dr.tz is pytz.utc + assert dr.tz is timezone.utc # DateRange with naive datetimes - dr = bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc) + dr = bdate_range("1/1/2005", "1/1/2009", tz=timezone.utc) dr = bdate_range("1/1/2005", "1/1/2009", tz=tz) # normalized @@ -231,13 +236,16 @@ def test_with_tz(self, tz): # datetimes with tzinfo set dr = bdate_range( - datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) + datetime(2005, 1, 1, tzinfo=timezone.utc), + datetime(2009, 1, 1, tzinfo=timezone.utc), ) msg = "Start and end cannot both be tz-aware with different timezones" with pytest.raises(Exception, match=msg): - bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) + bdate_range(datetime(2005, 1, 1, tzinfo=timezone.utc), "1/1/2009", tz=tz) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Eastern"), gettz("US/Eastern")] + ) def test_dti_convert_tz_aware_datetime_datetime(self, tz): # GH#1581 dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)] diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 06dbb33aadf97..cc3dadc6bb61c 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -1,8 +1,8 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -114,11 +114,11 @@ def test_append_index(): result = idx1.append(midx_lv2) # see gh-7112 - tz = pytz.timezone("Asia/Tokyo") + tz = zoneinfo.ZoneInfo("Asia/Tokyo") expected_tuples = [ - (1.1, tz.localize(datetime(2011, 1, 1))), - (1.2, tz.localize(datetime(2011, 1, 2))), - (1.3, tz.localize(datetime(2011, 1, 3))), + (1.1, datetime(2011, 1, 1, tzinfo=tz)), + (1.2, datetime(2011, 1, 2, tzinfo=tz)), + (1.3, datetime(2011, 1, 3, tzinfo=tz)), ] expected = Index([1.1, 1.2, 1.3] + expected_tuples) tm.assert_index_equal(result, expected) @@ -138,9 +138,9 @@ def test_append_index(): expected = Index._simple_new( np.array( [ - (1.1, tz.localize(datetime(2011, 1, 1)), "A"), - (1.2, tz.localize(datetime(2011, 1, 2)), "B"), - (1.3, tz.localize(datetime(2011, 1, 3)), "C"), + (1.1, datetime(2011, 1, 1, tzinfo=tz), "A"), + (1.2, datetime(2011, 1, 2, tzinfo=tz), "B"), + (1.3, datetime(2011, 1, 3, tzinfo=tz), "C"), ] + expected_tuples, dtype=object, diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 8e05a8e6fc5d8..62118f1c82ebb 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -10,7 +10,6 @@ import dateutil import numpy as np import pytest -import pytz import pandas._libs.json as ujson from pandas.compat import IS64 @@ -370,6 +369,7 @@ def 
test_encode_time_conversion_basic(self, test): def test_encode_time_conversion_pytz(self): # see gh-11473: to_json segfaults with timezone-aware datetimes + pytz = pytest.importorskip("pytz") test = datetime.time(10, 12, 15, 343243, pytz.utc) output = ujson.ujson_dumps(test) expected = f'"{test.isoformat()}"' diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index e9c6c0f5e32d7..ec7e5575b2e7d 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -12,7 +12,6 @@ import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -217,6 +216,7 @@ def test_parse_tz_aware(all_parsers): {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) if parser.engine == "pyarrow": + pytz = pytest.importorskip("pytz") expected_tz = pytz.utc else: expected_tz = timezone.utc diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 893728748f276..dc82994bcbc7f 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -1,5 +1,7 @@ """test feather-format compat""" +import zoneinfo + import numpy as np import pytest @@ -62,6 +64,7 @@ def test_error(self): self.check_error_on_write(obj, ValueError, msg) def test_basic(self): + tz = zoneinfo.ZoneInfo("US/Eastern") df = pd.DataFrame( { "string": list("abc"), @@ -76,7 +79,7 @@ def test_basic(self): list(pd.date_range("20130101", periods=3)), freq=None ), "dttz": pd.DatetimeIndex( - list(pd.date_range("20130101", periods=3, tz="US/Eastern")), + list(pd.date_range("20130101", periods=3, tz=tz)), freq=None, ), "dt_with_null": [ @@ -93,7 +96,7 @@ def test_basic(self): df["timedeltas"] = pd.timedelta_range("1 day", periods=3) df["intervals"] = pd.interval_range(0, 3, 3) - assert df.dttz.dtype.tz.zone == "US/Eastern" + assert df.dttz.dtype.tz.key == "US/Eastern" expected = df.copy() expected.loc[1, "bool_with_null"] = None diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index af492b967bc1d..2e8e358b8e3c9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1129,9 +1129,11 @@ def test_infer_string_large_string_type(self, tmp_path, pa): class TestParquetFastParquet(Base): @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") def test_basic(self, fp, df_full): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Eastern") df = df_full - dti = pd.date_range("20130101", periods=3, tz="US/Eastern") + dti = pd.date_range("20130101", periods=3, tz=tz) dti = dti._with_freq(None) # freq doesn't round-trip df["datetime_tz"] = dti df["timedelta"] = pd.timedelta_range("1 day", periods=3) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 98abbe3905204..5fe0f1265edff 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -77,6 +77,7 @@ def compare_element(result, expected, typ): def test_pickles(datapath): + pytest.importorskip("pytz") if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 7f37ca6831faa..cf0cbabb0258c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1,9 +1,9 @@ from datetime import datetime from functools import partial +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs import lib 
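# Editorial aside: the test rewrites in this stretch replace pytz-only idioms
# with stdlib equivalents: pytz.utc becomes datetime.timezone.utc, and
# tz.localize(naive_dt) becomes datetime(..., tzinfo=zone). pytz zones needed
# .localize() because passing them via tzinfo= yields a wrong LMT offset,
# while zoneinfo zones are ordinary tzinfo objects that resolve offsets
# lazily. A small check (assumes a tz database, e.g. the tzdata package):
from datetime import datetime
import zoneinfo

tz = zoneinfo.ZoneInfo("Asia/Tokyo")
aware = datetime(2011, 1, 1, tzinfo=tz)
assert str(aware.utcoffset()) == "9:00:00"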
from pandas._typing import DatetimeNaTType @@ -1655,13 +1655,13 @@ def test_resample_dst_anchor2(unit): def test_downsample_across_dst(unit): # GH 8531 - tz = pytz.timezone("Europe/Berlin") + tz = zoneinfo.ZoneInfo("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq="2h").as_unit(unit) + dates = date_range(dt.astimezone(tz), periods=4, freq="2h").as_unit(unit) result = Series(5, index=dates).resample("h").mean() expected = Series( [5.0, np.nan] * 3 + [5.0], - index=date_range(tz.localize(dt), periods=7, freq="h").as_unit(unit), + index=date_range(dt.astimezone(tz), periods=7, freq="h").as_unit(unit), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index a4e27ad46c59c..89a21f0565793 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1,11 +1,14 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import re import warnings +import zoneinfo import dateutil import numpy as np import pytest -import pytz from pandas._libs.tslibs.ccalendar import ( DAYS, @@ -304,7 +307,7 @@ def test_resample_incompat_freq(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), dateutil.tz.gettz("America/Los_Angeles"), ], ) @@ -312,9 +315,13 @@ def test_with_local_timezone(self, tz): # see gh-5430 local_timezone = tz - start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) + start = datetime( + year=2013, month=11, day=1, hour=0, minute=0, tzinfo=timezone.utc + ) # 1 day later - end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) + end = datetime( + year=2013, month=11, day=2, hour=0, minute=0, tzinfo=timezone.utc + ) index = date_range(start, end, freq="h", name="idx") @@ -336,7 +343,7 @@ def test_with_local_timezone(self, tz): @pytest.mark.parametrize( "tz", [ - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), dateutil.tz.gettz("America/Los_Angeles"), ], ) @@ -353,8 +360,6 @@ def test_resample_with_tz(self, tz, unit): index=exp_dti, ) tm.assert_series_equal(result, expected) - # Especially assert that the timezone is LMT for pytz - assert result.index.tz == tz def test_resample_nonexistent_time_bin_edge(self): # GH 19375 diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index afafe8f6ab264..d0ff950e7985f 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -1,3 +1,5 @@ +import zoneinfo + import numpy as np import pytest @@ -353,14 +355,15 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) # different tz - dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + tz_diff = zoneinfo.ZoneInfo("US/Hawaii") + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz_diff) exp = Index( [ pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2011-01-02", tz=tz), - pd.Timestamp("2012-01-01", tz="US/Pacific"), - pd.Timestamp("2012-01-02", tz="US/Pacific"), + pd.Timestamp("2012-01-01", tz=tz_diff), + pd.Timestamp("2012-01-02", tz=tz_diff), ], dtype=object, ) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 4fc57c14ec4c3..bd364de26a3c4 100644 --- 
a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -2,7 +2,6 @@ import numpy as np import pytest -import pytz import pandas.util._test_decorators as td @@ -2071,7 +2070,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-02"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value1": np.arange(5), @@ -2083,7 +2082,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-01"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value2": list("ABCDE"), @@ -2097,7 +2096,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-02"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value1": np.arange(5), diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 131be7a77f2e5..b20df43dd49a6 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -3,10 +3,10 @@ timedelta, ) import operator +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs.tslibs import iNaT from pandas.compat.numpy import np_version_gte1p24p3 @@ -361,7 +361,7 @@ def test_nat_doc_strings(compare): (Timestamp("2014-01-01"), "timestamp"), (Timestamp("2014-01-01", tz="UTC"), "timestamp"), (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), - (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), + (datetime(2014, 1, 1).astimezone(zoneinfo.ZoneInfo("Asia/Tokyo")), "timestamp"), ], ) def test_nat_arithmetic_scalar(op_name, value, val_type): diff --git a/pandas/tests/scalar/timestamp/methods/test_replace.py b/pandas/tests/scalar/timestamp/methods/test_replace.py index c5169fdff0cd4..f15ea0e485cae 100644 --- a/pandas/tests/scalar/timestamp/methods/test_replace.py +++ b/pandas/tests/scalar/timestamp/methods/test_replace.py @@ -1,9 +1,9 @@ from datetime import datetime +import zoneinfo from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -111,8 +111,8 @@ def test_replace_tzinfo_equiv_tz_localize_none(self): @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_replace_tzinfo(self): # GH#15683 - dt = datetime(2016, 3, 27, 1) - tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + dt = datetime(2016, 3, 27, 1, fold=1) + tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo result_dt = dt.replace(tzinfo=tzinfo) result_pd = Timestamp(dt).replace(tzinfo=tzinfo) @@ -137,13 +137,16 @@ def test_replace_tzinfo(self): @pytest.mark.parametrize( "tz, normalize", [ - (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), + ("pytz/US/Eastern", lambda x: x.tzinfo.normalize(x)), (gettz("US/Eastern"), lambda x: x), ], ) def test_replace_across_dst(self, tz, normalize): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) ts_naive = Timestamp("2017-12-03 16:03:30") ts_aware = conversion.localize_pydatetime(ts_naive, tz) diff --git a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py index b576317fca8b4..beacaaf04e6b2 100644 --- a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py +++ 
b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py @@ -1,8 +1,8 @@ # NB: This is for the Timestamp.timestamp *method* specifically, not # the Timestamp class in general. +from datetime import timezone import pytest -from pytz import utc from pandas._libs.tslibs import Timestamp from pandas.compat import WASM @@ -18,7 +18,7 @@ def test_timestamp(self, fixed_now_ts): # GH#17329 # tz-naive --> treat it as if it were UTC for purposes of timestamp() ts = fixed_now_ts - uts = ts.replace(tzinfo=utc) + uts = ts.replace(tzinfo=timezone.utc) assert ts.timestamp() == uts.timestamp() tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central") diff --git a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py index be6ec7dbc24c7..07e57b51a7f1e 100644 --- a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py +++ b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py @@ -3,7 +3,7 @@ timedelta, ) -import pytz +import pytest from pandas._libs.tslibs.timezones import dateutil_gettz as gettz import pandas.util._test_decorators as td @@ -43,6 +43,7 @@ def test_timestamp_to_pydatetime_dateutil(self): assert stamp.tzinfo == dtval.tzinfo def test_timestamp_to_pydatetime_explicit_pytz(self): + pytz = pytest.importorskip("pytz") stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern")) dtval = stamp.to_pydatetime() assert stamp == dtval diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py index 0786cc58a4f95..90dc8d77608cb 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py @@ -1,5 +1,6 @@ from datetime import timedelta import re +import zoneinfo from dateutil.tz import gettz import pytest @@ -17,68 +18,56 @@ Timestamp, ) -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - class TestTimestampTZLocalize: @pytest.mark.skip_ubsan def test_tz_localize_pushes_out_of_bounds(self): # GH#12677 # tz_localize that pushes away from the boundary is OK + pytz = pytest.importorskip("pytz") msg = ( f"Converting {Timestamp.min.strftime('%Y-%m-%d %H:%M:%S')} " f"underflows past {Timestamp.min}" ) - pac = Timestamp.min.tz_localize("US/Pacific") + pac = Timestamp.min.tz_localize(pytz.timezone("US/Pacific")) assert pac._value > Timestamp.min._value pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.min.tz_localize("Asia/Tokyo") + Timestamp.min.tz_localize(pytz.timezone("Asia/Tokyo")) # tz_localize that pushes away from the boundary is OK msg = ( f"Converting {Timestamp.max.strftime('%Y-%m-%d %H:%M:%S')} " f"overflows past {Timestamp.max}" ) - tokyo = Timestamp.max.tz_localize("Asia/Tokyo") + tokyo = Timestamp.max.tz_localize(pytz.timezone("Asia/Tokyo")) assert tokyo._value < Timestamp.max._value tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.max.tz_localize("US/Pacific") + Timestamp.max.tz_localize(pytz.timezone("US/Pacific")) - def test_tz_localize_ambiguous_bool(self, unit): + @pytest.mark.parametrize( + "tz", + [zoneinfo.ZoneInfo("US/Central"), "dateutil/US/Central", "pytz/US/Central"], + ) + def test_tz_localize_ambiguous_bool(self, unit, tz): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 + if 
isinstance(tz, str) and tz.startswith("pytz/"): + tz = pytz.timezone(tz.removeprefix("pytz/")) ts = Timestamp("2015-11-01 01:00:03").as_unit(unit) - expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") - expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz=tz) + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz=tz) msg = "Cannot infer dst time from 2015-11-01 01:00:03" with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("US/Central") + ts.tz_localize(tz) - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("dateutil/US/Central") - - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Central") - except KeyError: - # no tzdata - pass - else: - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize(tz) - - result = ts.tz_localize("US/Central", ambiguous=True) + result = ts.tz_localize(tz, ambiguous=True) assert result == expected0 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - result = ts.tz_localize("US/Central", ambiguous=False) + result = ts.tz_localize(tz, ambiguous=False) assert result == expected1 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value @@ -205,9 +194,10 @@ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): def test_tz_localize_ambiguous_compat(self): # validate that pytz and dateutil are compat for dst # when the transition happens + pytz = pytest.importorskip("pytz") naive = Timestamp("2013-10-27 01:00:00") - pytz_zone = "Europe/London" + pytz_zone = pytz.timezone("Europe/London") dateutil_zone = "dateutil/Europe/London" result_pytz = naive.tz_localize(pytz_zone, ambiguous=False) result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=False) @@ -236,13 +226,16 @@ def test_tz_localize_ambiguous_compat(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), - "US/Eastern", + zoneinfo.ZoneInfo("US/Eastern"), "dateutil/US/Eastern", ], ) def test_timestamp_tz_localize(self, tz): + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) stamp = Timestamp("3/11/2012 04:00") result = stamp.tz_localize(tz) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 2d58513989a66..7aa6c6c0496a9 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -7,7 +7,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -294,7 +293,7 @@ def test_subtract_different_utc_objects(self, utc_fixture, utc_fixture2): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), "US/Eastern", "dateutil/US/Eastern", @@ -302,7 +301,9 @@ def test_subtract_different_utc_objects(self, utc_fixture, utc_fixture2): ) def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): # GH#1389 - + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) # 4 hours before DST transition stamp = Timestamp("3/10/2012 22:00", tz=tz) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 4249063b67d31..39f302c3357de 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py 
+++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -122,6 +122,7 @@ def test_timestamp_constructor_pytz_fold_raise(self): # Test for GH#25057 # pytz doesn't support fold. Check that we raise # if fold is passed with pytz + pytz = pytest.importorskip("pytz") msg = "pytz timezones do not support fold. Please use dateutil timezones." tz = pytz.timezone("Europe/London") with pytest.raises(ValueError, match=msg): @@ -159,15 +160,13 @@ def test_timestamp_constructor_retain_fold(self, tz, fold): expected = fold assert result == expected - try: - _tzs = [ + @pytest.mark.parametrize( + "tz", + [ "dateutil/Europe/London", zoneinfo.ZoneInfo("Europe/London"), - ] - except zoneinfo.ZoneInfoNotFoundError: - _tzs = ["dateutil/Europe/London"] - - @pytest.mark.parametrize("tz", _tzs) + ], + ) @pytest.mark.parametrize( "ts_input,fold_out", [ @@ -560,11 +559,11 @@ def test_constructor(self): timezones = [ (None, 0), ("UTC", 0), - (pytz.utc, 0), + (timezone.utc, 0), ("Asia/Tokyo", 9), ("US/Eastern", -4), ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), + (timezone(timedelta(hours=-3)), -3), (dateutil.tz.tzoffset(None, 18000), 5), ] @@ -617,11 +616,11 @@ def test_constructor_with_stringoffset(self): timezones = [ ("UTC", 0), - (pytz.utc, 0), + (timezone.utc, 0), ("Asia/Tokyo", 9), ("US/Eastern", -4), ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), + (timezone(timedelta(hours=-3)), -3), (dateutil.tz.tzoffset(None, 18000), 5), ] @@ -701,7 +700,7 @@ def test_constructor_invalid_tz(self): msg = "at most one of" with pytest.raises(ValueError, match=msg): - Timestamp("2017-10-22", tzinfo=pytz.utc, tz="UTC") + Timestamp("2017-10-22", tzinfo=timezone.utc, tz="UTC") msg = "Cannot pass a date attribute keyword argument when passing a date string" with pytest.raises(ValueError, match=msg): @@ -714,11 +713,11 @@ def test_constructor_tz_or_tzinfo(self): # GH#17943, GH#17690, GH#5168 stamps = [ Timestamp(year=2017, month=10, day=22, tz="UTC"), - Timestamp(year=2017, month=10, day=22, tzinfo=pytz.utc), - Timestamp(year=2017, month=10, day=22, tz=pytz.utc), - Timestamp(datetime(2017, 10, 22), tzinfo=pytz.utc), + Timestamp(year=2017, month=10, day=22, tzinfo=timezone.utc), + Timestamp(year=2017, month=10, day=22, tz=timezone.utc), + Timestamp(datetime(2017, 10, 22), tzinfo=timezone.utc), Timestamp(datetime(2017, 10, 22), tz="UTC"), - Timestamp(datetime(2017, 10, 22), tz=pytz.utc), + Timestamp(datetime(2017, 10, 22), tz=timezone.utc), ] assert all(ts == stamps[0] for ts in stamps) @@ -893,13 +892,13 @@ def test_construct_timestamp_near_dst(self, offset): def test_construct_with_different_string_format(self, arg): # GH 12064 result = Timestamp(arg) - expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540)) + expected = Timestamp(datetime(2013, 1, 1), tz=timezone(timedelta(hours=9))) assert result == expected @pytest.mark.parametrize("box", [datetime, Timestamp]) def test_raise_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 - kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": pytz.utc} + kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": timezone.utc} msg = "Cannot pass a datetime or Timestamp" with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tz="US/Pacific") @@ -907,7 +906,7 @@ def test_raise_tz_and_tzinfo_in_datetime_input(self, box): with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) - def test_dont_convert_dateutil_utc_to_pytz_utc(self): + def 
test_dont_convert_dateutil_utc_to_default_utc(self): result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc()) assert result == expected @@ -991,7 +990,7 @@ def test_timestamp_constructor_near_dst_boundary(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), "US/Eastern", "dateutil/US/Eastern", @@ -1000,7 +999,9 @@ def test_timestamp_constructor_near_dst_boundary(self): def test_timestamp_constructed_by_date_and_tz(self, tz): # GH#2993, Timestamp cannot be constructed by datetime.date # and tz correctly - + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) result = Timestamp(date(2012, 3, 11), tz=tz) expected = Timestamp("3/11/2012", tz=tz) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 44db1187850c9..7b20f0a17556d 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -1,9 +1,11 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import pprint import dateutil.tz import pytest -import pytz # a test below uses pytz but only inside a `eval` call from pandas.compat import WASM @@ -181,14 +183,14 @@ def test_repr_matches_pydatetime_no_tz(self): ts_nanos_micros = Timestamp(1200) assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200" - def test_repr_matches_pydatetime_tz_pytz(self): - dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc) + def test_repr_matches_pydatetime_tz_stdlib(self): + dt_date = datetime(2013, 1, 2, tzinfo=timezone.utc) assert str(dt_date) == str(Timestamp(dt_date)) - dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc) + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=timezone.utc) assert str(dt_datetime) == str(Timestamp(dt_datetime)) - dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc) + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=timezone.utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) def test_repr_matches_pydatetime_tz_dateutil(self): diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 79fd285073983..38d0ddfbc13bd 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -9,6 +9,7 @@ import locale import time import unicodedata +import zoneinfo from dateutil.tz import ( tzlocal, @@ -20,8 +21,6 @@ ) import numpy as np import pytest -import pytz -from pytz import utc from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.timezones import ( @@ -259,7 +258,7 @@ def test_dow_parametric(self, ts, sign): class TestTimestamp: - @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) + @pytest.mark.parametrize("tz", [None, zoneinfo.ZoneInfo("US/Pacific")]) def test_disallow_setting_tz(self, tz): # GH#3746 ts = Timestamp("2010") @@ -311,7 +310,7 @@ def compare(x, y): assert int((Timestamp(x)._value - Timestamp(y)._value) / 1e9) == 0 compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now("UTC"), datetime.now(pytz.timezone("UTC"))) + compare(Timestamp.now("UTC"), datetime.now(timezone.utc)) compare(Timestamp.now("UTC"), datetime.now(tzutc())) msg = "Timestamp.utcnow is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -329,12 +328,12 @@ def compare(x, y): compare( # 
Support tz kwarg in Timestamp.fromtimestamp Timestamp.fromtimestamp(current_time, "UTC"), - datetime.fromtimestamp(current_time, utc), + datetime.fromtimestamp(current_time, timezone.utc), ) compare( # Support tz kwarg in Timestamp.fromtimestamp Timestamp.fromtimestamp(current_time, tz="UTC"), - datetime.fromtimestamp(current_time, utc), + datetime.fromtimestamp(current_time, timezone.utc), ) date_component = datetime.now(timezone.utc) @@ -585,9 +584,9 @@ def test_month_name(self, dt64, ts): assert ts.month_name() == alt.month_name() def test_tz_convert(self, ts): - ts = Timestamp._from_value_and_reso(ts._value, ts._creso, utc) + ts = Timestamp._from_value_and_reso(ts._value, ts._creso, timezone.utc) - tz = pytz.timezone("US/Pacific") + tz = zoneinfo.ZoneInfo("US/Pacific") result = ts.tz_convert(tz) assert isinstance(result, Timestamp) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 3b41c8ee463d8..97cafc33611ed 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -14,7 +14,6 @@ ) import numpy as np import pytest -import pytz from pandas._libs import index as libindex @@ -63,6 +62,7 @@ def test_fancy_setitem(): @pytest.mark.parametrize("tz_source", ["pytz", "dateutil"]) def test_getitem_setitem_datetime_tz(tz_source): if tz_source == "pytz": + pytz = pytest.importorskip(tz_source) tzget = pytz.timezone else: # handle special case for utc in dateutil diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index c10bb8278a3d1..e1c771ec6e658 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -6,7 +6,6 @@ import numpy as np import pytest -import pytz from pandas import ( Categorical, @@ -861,7 +860,7 @@ def test_fillna_bug(self): def test_ffill_mixed_dtypes_without_missing_data(self): # GH#14956 - series = Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) + series = Series([datetime(2015, 1, 1, tzinfo=timezone.utc), 1]) result = series.ffill() tm.assert_series_equal(series, result) @@ -923,16 +922,16 @@ def test_datetime64tz_fillna_round_issue(self): # GH#14872 data = Series( - [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc)] ) filled = data.bfill() expected = Series( [ - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), ] ) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index cbbd018720bad..4f5b7f884ce12 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -10,11 +10,11 @@ ) from decimal import Decimal import locale +import zoneinfo from dateutil.parser import parse import numpy as np import pytest -import pytz from pandas._libs import tslib from pandas._libs.tslibs import ( @@ -432,9 +432,11 @@ def test_to_datetime_format_weeks(self, value, fmt, expected, cache): ["2010-01-01 12:00:00 Z", "2010-01-01 12:00:00 Z"], [ Timestamp( - "2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0) - ), # pytz coerces to UTC - Timestamp("2010-01-01 12:00:00", 
tzinfo=pytz.FixedOffset(0)), + "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0)) + ), + Timestamp( + "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0)) + ), ], ], ], @@ -1169,6 +1171,7 @@ def test_to_datetime_different_offsets_removed(self, cache): def test_to_datetime_tz_pytz(self, cache): # see gh-8260 + pytz = pytest.importorskip("pytz") us_eastern = pytz.timezone("US/Eastern") arr = np.array( [ @@ -1699,7 +1702,9 @@ def test_to_datetime_fixed_offset(self): ["2020-10-26 00:00:00+06:00", Timestamp("2018-01-01", tz="US/Pacific")], [ "2020-10-26 00:00:00+06:00", - datetime(2020, 1, 1, 18, tzinfo=pytz.timezone("Australia/Melbourne")), + datetime(2020, 1, 1, 18).astimezone( + zoneinfo.ZoneInfo("Australia/Melbourne") + ), ], ], ) @@ -2351,7 +2356,7 @@ def test_to_datetime_iso8601_non_padded(self, input, format): ) def test_to_datetime_iso8601_with_timezone_valid(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 - expected = Timestamp(2020, 1, 1, tzinfo=pytz.UTC) + expected = Timestamp(2020, 1, 1, tzinfo=timezone.utc) result = to_datetime(input, format=format) assert result == expected @@ -2778,7 +2783,7 @@ def test_infer_datetime_format_zero_tz(self, ts, zero_tz): # GH 41047 ser = Series([ts + zero_tz]) result = to_datetime(ser) - tz = pytz.utc if zero_tz == "Z" else None + tz = timezone.utc if zero_tz == "Z" else None expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) @@ -3213,7 +3218,7 @@ def test_invalid_origins(self, origin, exc, units): def test_invalid_origins_tzinfo(self): # GH16842 with pytest.raises(ValueError, match="must be tz-naive"): - to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=timezone.utc)) def test_incorrect_value_exception(self): # GH47495 diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 08f4a1250392e..ffe6ff0b51bcf 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -1,7 +1,9 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import pytest -from pytz import utc from pandas import ( DatetimeIndex, @@ -128,9 +130,9 @@ def test_holiday_dates(holiday, start_date, end_date, expected): # Verify that timezone info is preserved. assert list( holiday.dates( - utc.localize(Timestamp(start_date)), utc.localize(Timestamp(end_date)) + Timestamp(start_date, tz=timezone.utc), Timestamp(end_date, tz=timezone.utc) ) - ) == [utc.localize(dt) for dt in expected] + ) == [dt.replace(tzinfo=timezone.utc) for dt in expected] @pytest.mark.parametrize( @@ -194,8 +196,10 @@ def test_holidays_within_dates(holiday, start, expected): # Verify that timezone info is preserved. 
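[Editor's sketch] The holiday-test hunks here replace pytz's `utc.localize(...)` idiom with timezone-aware construction via the stdlib. A minimal sketch of the equivalence, with hypothetical example values rather than pandas test data (for non-fixed zones, `zoneinfo.ZoneInfo` rather than a bare tzinfo swap is the safer replacement):

    from datetime import datetime, timezone

    import pandas as pd

    naive = datetime(2016, 12, 31)

    # pytz spelling (optional dependency):  pytz.utc.localize(naive)
    # stdlib spellings used as replacements in the diffs above:
    aware_dt = naive.replace(tzinfo=timezone.utc)
    aware_ts = pd.Timestamp(naive, tz=timezone.utc)

    assert aware_dt == aware_ts  # same instant, both UTC-aware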
assert list( - holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(start))) - ) == [utc.localize(dt) for dt in expected] + holiday.dates( + Timestamp(start, tz=timezone.utc), Timestamp(start, tz=timezone.utc) + ) + ) == [dt.replace(tzinfo=timezone.utc) for dt in expected] @pytest.mark.parametrize( diff --git a/pandas/tests/tseries/offsets/test_dst.py b/pandas/tests/tseries/offsets/test_dst.py index 8ff80536fc69e..dfdc69c0fe18e 100644 --- a/pandas/tests/tseries/offsets/test_dst.py +++ b/pandas/tests/tseries/offsets/test_dst.py @@ -5,7 +5,6 @@ from datetime import timedelta import pytest -import pytz from pandas._libs.tslibs import Timestamp from pandas._libs.tslibs.offsets import ( @@ -33,10 +32,8 @@ from pandas import DatetimeIndex import pandas._testing as tm -from pandas.util.version import Version -# error: Module has no attribute "__version__" -pytz_version = Version(pytz.__version__) # type: ignore[attr-defined] +pytz = pytest.importorskip("pytz") def get_utc_offset_hours(ts): @@ -52,7 +49,10 @@ class TestDST: # test both basic names and dateutil timezones timezone_utc_offsets = { - "US/Eastern": {"utc_offset_daylight": -4, "utc_offset_standard": -5}, + pytz.timezone("US/Eastern"): { + "utc_offset_daylight": -4, + "utc_offset_standard": -5, + }, "dateutil/US/Pacific": {"utc_offset_daylight": -7, "utc_offset_standard": -8}, } valid_date_offsets_singular = [ @@ -96,7 +96,10 @@ def _test_offset( if ( offset_name in ["hour", "minute", "second", "microsecond"] and offset_n == 1 - and tstart == Timestamp("2013-11-03 01:59:59.999999-0500", tz="US/Eastern") + and tstart + == Timestamp( + "2013-11-03 01:59:59.999999-0500", tz=pytz.timezone("US/Eastern") + ) ): # This addition results in an ambiguous wall time err_msg = { @@ -147,7 +150,9 @@ def _test_offset( assert datepart_offset == offset.kwds[offset_name] else: # the offset should be the same as if it was done in UTC - assert t == (tstart.tz_convert("UTC") + offset).tz_convert("US/Pacific") + assert t == (tstart.tz_convert("UTC") + offset).tz_convert( + pytz.timezone("US/Pacific") + ) def _make_timestamp(self, string, hrs_offset, tz): if hrs_offset >= 0: @@ -224,16 +229,6 @@ def test_all_offset_classes(self, tup): @pytest.mark.parametrize( "original_dt, target_dt, offset, tz", [ - pytest.param( - Timestamp("1900-01-01"), - Timestamp("1905-07-01"), - MonthBegin(66), - "Africa/Lagos", - marks=pytest.mark.xfail( - pytz_version < Version("2020.5") or pytz_version == Version("2022.2"), - reason="GH#41906: pytz utc transition dates changed", - ), - ), ( Timestamp("2021-10-01 01:15"), Timestamp("2021-10-31 01:15"), @@ -263,7 +258,7 @@ def test_all_offset_classes(self, tup): def test_nontick_offset_with_ambiguous_time_error(original_dt, target_dt, offset, tz): # .apply for non-Tick offsets throws AmbiguousTimeError when the target dt # is dst-ambiguous - localized_dt = original_dt.tz_localize(tz) + localized_dt = original_dt.tz_localize(pytz.timezone(tz)) msg = f"Cannot infer dst time from {target_dt}, try using the 'ambiguous' argument" with pytest.raises(pytz.AmbiguousTimeError, match=msg): diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 6a0b86cbd03ee..f62910b5e1f1c 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -1,8 +1,10 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest -from pytz import UTC from pandas._libs.tslibs import ( OutOfBoundsTimedelta, @@ -55,7 
+57,7 @@ def _compare_local_to_utc(tz_didx, naive_didx): def test_tz_localize_to_utc_copies(): # GH#46460 arr = np.arange(5, dtype="i8") - result = tz_convert_from_utc(arr, tz=UTC) + result = tz_convert_from_utc(arr, tz=timezone.utc) tm.assert_numpy_array_equal(result, arr) assert not np.shares_memory(arr, result) @@ -100,7 +102,7 @@ def test_tz_convert_readonly(): # GH#35530 arr = np.array([0], dtype=np.int64) arr.setflags(write=False) - result = tz_convert_from_utc(arr, UTC) + result = tz_convert_from_utc(arr, timezone.utc) tm.assert_numpy_array_equal(result, arr) @@ -141,14 +143,18 @@ class SubDatetime(datetime): "dt, expected", [ pytest.param( - Timestamp("2000-01-01"), Timestamp("2000-01-01", tz=UTC), id="timestamp" + Timestamp("2000-01-01"), + Timestamp("2000-01-01", tz=timezone.utc), + id="timestamp", ), pytest.param( - datetime(2000, 1, 1), datetime(2000, 1, 1, tzinfo=UTC), id="datetime" + datetime(2000, 1, 1), + datetime(2000, 1, 1, tzinfo=timezone.utc), + id="datetime", ), pytest.param( SubDatetime(2000, 1, 1), - SubDatetime(2000, 1, 1, tzinfo=UTC), + SubDatetime(2000, 1, 1, tzinfo=timezone.utc), id="subclassed_datetime", ), ], @@ -157,5 +163,5 @@ def test_localize_pydatetime_dt_types(dt, expected): # GH 25851 # ensure that subclassed datetime works with # localize_pydatetime - result = conversion.localize_pydatetime(dt, UTC) + result = conversion.localize_pydatetime(dt, timezone.utc) assert result == expected diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py index e9da6b3cf991c..49b87c055dc69 100644 --- a/pandas/tests/tslibs/test_resolution.py +++ b/pandas/tests/tslibs/test_resolution.py @@ -1,6 +1,7 @@ +import datetime + import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( Resolution, @@ -23,7 +24,7 @@ def test_get_resolution_non_nano_data(): res = get_resolution(arr, None, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US - res = get_resolution(arr, pytz.UTC, NpyDatetimeUnit.NPY_FR_us.value) + res = get_resolution(arr, datetime.timezone.utc, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 28e4889983fb9..8dd7060f21d59 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -6,7 +6,6 @@ import dateutil.tz import pytest -import pytz from pandas._libs.tslibs import ( conversion, @@ -22,10 +21,11 @@ def test_is_utc(utc_fixture): assert timezones.is_utc(tz) -@pytest.mark.parametrize("tz_name", list(pytz.common_timezones)) -def test_cache_keys_are_distinct_for_pytz_vs_dateutil(tz_name): - tz_p = timezones.maybe_get_tz(tz_name) - tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) +def test_cache_keys_are_distinct_for_pytz_vs_dateutil(): + pytz = pytest.importorskip("pytz") + for tz_name in pytz.common_timezones: + tz_p = timezones.maybe_get_tz(tz_name) + tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) if tz_d is None: pytest.skip(tz_name + ": dateutil does not know about this one") @@ -76,12 +76,15 @@ def test_tz_compare_utc(utc_fixture, utc_fixture2): @pytest.fixture( params=[ - (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), + ("pytz/US/Eastern", lambda tz, x: tz.localize(x)), (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)), ] ) def infer_setup(request): eastern, localize = request.param + if isinstance(eastern, str) and eastern.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + eastern = 
pytz.timezone(eastern.removeprefix("pytz/")) start_naive = datetime(2001, 1, 1) end_naive = datetime(2009, 1, 1) @@ -111,10 +114,10 @@ def test_infer_tz_compat(infer_setup): def test_infer_tz_utc_localize(infer_setup): _, _, start, end, start_naive, end_naive = infer_setup - utc = pytz.utc + utc = timezone.utc - start = utc.localize(start_naive) - end = utc.localize(end_naive) + start = start_naive.astimezone(utc) + end = end_naive.astimezone(utc) assert timezones.infer_tzinfo(start, end) is utc @@ -124,8 +127,8 @@ def test_infer_tz_mismatch(infer_setup, ordered): eastern, _, _, _, start_naive, end_naive = infer_setup msg = "Inputs must both have the same timezone" - utc = pytz.utc - start = utc.localize(start_naive) + utc = timezone.utc + start = start_naive.astimezone(utc) end = conversion.localize_pydatetime(end_naive, eastern) args = (start, end) if ordered else (end, start) @@ -139,7 +142,7 @@ def test_maybe_get_tz_invalid_types(): timezones.maybe_get_tz(44.0) with pytest.raises(TypeError, match=""): - timezones.maybe_get_tz(pytz) + timezones.maybe_get_tz(pytest) msg = "" with pytest.raises(TypeError, match=msg): From 0b45a6e0b26b779c62d83b597d921350bddeded0 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Mon, 24 Jun 2024 23:10:33 +0300 Subject: [PATCH 113/272] DEPR: Change version to 4.0 in deprecate decorators (#58967) --- pandas/core/frame.py | 24 +++++++++--------- pandas/core/series.py | 28 ++++++++++----------- pandas/tests/io/formats/test_to_markdown.py | 2 +- pandas/tests/io/formats/test_to_string.py | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 08b339dc26452..5b156cd75e373 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11644,7 +11644,7 @@ def all( **kwargs, ) -> Series | bool: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="all") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") @doc(make_doc("all", ndim=1)) def all( self, @@ -11691,7 +11691,7 @@ def min( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") @doc(make_doc("min", ndim=2)) def min( self, @@ -11738,7 +11738,7 @@ def max( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") @doc(make_doc("max", ndim=2)) def max( self, @@ -11754,7 +11754,7 @@ def max( result = result.__finalize__(self, method="max") return result - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") def sum( self, axis: Axis | None = 0, @@ -11855,7 +11855,7 @@ def sum( result = result.__finalize__(self, method="sum") return result - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod") def prod( self, axis: Axis | None = 0, @@ -11973,7 +11973,7 @@ def mean( **kwargs, ) -> Series | Any: ... 
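[Editor's sketch] Every hunk in this patch has the same shape: only the `version=` argument of the decorator changes. For orientation, a minimal sketch of how a decorator of this kind can work; this is an illustration only, not pandas' actual `deprecate_nonkeyword_arguments` implementation:

    import functools
    import warnings

    def keyword_only_after(version, allowed_args, name):
        # Warn when more positional arguments arrive than the allowed list permits.
        def decorate(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if len(args) > len(allowed_args):
                    warnings.warn(
                        f"Starting with pandas version {version} all arguments of "
                        f"{name} except for {allowed_args!r} will be keyword-only.",
                        FutureWarning,
                        stacklevel=2,
                    )
                return func(*args, **kwargs)

            return wrapper

        return decorate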
- @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean") @doc(make_doc("mean", ndim=2)) def mean( self, @@ -12020,7 +12020,7 @@ def median( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median") @doc(make_doc("median", ndim=2)) def median( self, @@ -12070,7 +12070,7 @@ def sem( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem") def sem( self, axis: Axis | None = 0, @@ -12190,7 +12190,7 @@ def var( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var") def var( self, axis: Axis | None = 0, @@ -12309,7 +12309,7 @@ def std( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std") def std( self, axis: Axis | None = 0, @@ -12432,7 +12432,7 @@ def skew( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew") def skew( self, axis: Axis | None = 0, @@ -12552,7 +12552,7 @@ def kurt( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt") def kurt( self, axis: Axis | None = 0, diff --git a/pandas/core/series.py b/pandas/core/series.py index 2781fa6af0d42..ee34240366aba 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1426,7 +1426,7 @@ def to_string( ) -> None: ... 
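[Editor's sketch] From the caller's side the bump only changes the version quoted in the FutureWarning, which is what the test updates at the end of this patch check. A hedged usage sketch, assuming a pandas build that includes this patch:

    import warnings

    import pandas as pd

    ser = pd.Series([1, 2, 3])
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        ser.min(0)  # 'axis' passed positionally
    assert any("pandas version 4.0" in str(w.message) for w in caught)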
@deprecate_nonkeyword_arguments( - version="3.0.0", allowed_args=["self", "buf"], name="to_string" + version="4.0", allowed_args=["self", "buf"], name="to_string" ) def to_string( self, @@ -1584,7 +1584,7 @@ def to_markdown( ), ) @deprecate_nonkeyword_arguments( - version="3.0.0", allowed_args=["self", "buf"], name="to_markdown" + version="4.0", allowed_args=["self", "buf"], name="to_markdown" ) def to_markdown( self, @@ -6530,7 +6530,7 @@ def any( # type: ignore[override] filter_type="bool", ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="all") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") @Appender(make_doc("all", ndim=1)) def all( self, @@ -6550,7 +6550,7 @@ def all( filter_type="bool", ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") def min( self, axis: Axis | None = 0, @@ -6621,7 +6621,7 @@ def min( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") def max( self, axis: Axis | None = 0, @@ -6692,7 +6692,7 @@ def max( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") def sum( self, axis: Axis | None = None, @@ -6793,7 +6793,7 @@ def sum( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod") @doc(make_doc("prod", ndim=1)) def prod( self, @@ -6812,7 +6812,7 @@ def prod( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean") def mean( self, axis: Axis | None = 0, @@ -6866,7 +6866,7 @@ def mean( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median") def median( self, axis: Axis | None = 0, @@ -6947,7 +6947,7 @@ def median( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem") @doc(make_doc("sem", ndim=1)) def sem( self, @@ -6966,7 +6966,7 @@ def sem( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var") def var( self, axis: Axis | None = None, @@ -7053,7 +7053,7 @@ def var( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std") @doc(make_doc("std", ndim=1)) def std( self, @@ -7072,7 +7072,7 @@ def std( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew") @doc(make_doc("skew", ndim=1)) def skew( self, @@ -7085,7 +7085,7 @@ def skew( self, 
axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt") def kurt( self, axis: Axis | None = 0, diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py index fffb1b9b9d2a4..7aa7cebb5120f 100644 --- a/pandas/tests/io/formats/test_to_markdown.py +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -11,7 +11,7 @@ def test_keyword_deprecation(): # GH 57280 msg = ( - "Starting with pandas version 3.0.0 all arguments of to_markdown " + "Starting with pandas version 4.0 all arguments of to_markdown " "except for the argument 'buf' will be keyword-only." ) s = pd.Series() diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 7c7069aa74eeb..ed871577d677f 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -38,7 +38,7 @@ class TestDataFrameToStringFormatters: def test_keyword_deprecation(self): # GH 57280 msg = ( - "Starting with pandas version 3.0.0 all arguments of to_string " + "Starting with pandas version 4.0 all arguments of to_string " "except for the argument 'buf' will be keyword-only." ) s = Series(["a", "b"]) From b1e5f0636ab2cc3e6949a051e37dc289616a4a4f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Jun 2024 11:07:02 -1000 Subject: [PATCH 114/272] TST: Fix some test builds for numpy 2.0 (#59046) * TST: Fix some test builds for numpy 2.0 * 64 not 32 * Adjust some tests * Revert "Adjust some tests" This reverts commit ca28f250be92757fe3053019ff183139bb53fc56. * Just pin numpy in pyarrow nightly build * Mark test as pa under 17 --- ci/deps/actions-311-pyarrownightly.yaml | 2 +- pandas/compat/__init__.py | 2 ++ pandas/compat/numpy/__init__.py | 2 +- pandas/compat/pyarrow.py | 2 ++ pandas/core/dtypes/cast.py | 13 +++++-------- pandas/tests/indexes/datetimelike_/test_indexing.py | 2 +- pandas/tests/io/test_parquet.py | 5 ++++- pandas/tests/scalar/timedelta/test_arithmetic.py | 2 +- pandas/tests/tools/test_to_datetime.py | 2 +- 9 files changed, 18 insertions(+), 14 deletions(-) diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index d84063ac2a9ba..5455b9b84b034 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -18,7 +18,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz - pip diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 13e6707667d0a..e08da7c7e14e3 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -31,6 +31,7 @@ pa_version_under14p0, pa_version_under14p1, pa_version_under16p0, + pa_version_under17p0, ) if TYPE_CHECKING: @@ -154,6 +155,7 @@ def is_ci_environment() -> bool: "pa_version_under14p0", "pa_version_under14p1", "pa_version_under16p0", + "pa_version_under17p0", "IS64", "ISMUSL", "PY311", diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 54a12c76a230b..2fab8f32b8e71 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -12,7 +12,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") -np_version_gt2 = _nlv >= Version("2.0.0.dev0") +np_version_gt2 = _nlv >= 
Version("2.0.0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.23.5" diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 5a96e5a4cc49a..87d3dc86cee87 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,7 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") + pa_version_under17p0 = _palv < Version("17.0.0") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -25,3 +26,4 @@ pa_version_under14p1 = True pa_version_under15p0 = True pa_version_under16p0 = True + pa_version_under17p0 = True diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 662b8c5791e51..f2af69fcc9d84 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -39,7 +39,6 @@ is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 -from pandas.compat.numpy import np_version_gt2 from pandas.errors import ( IntCastingNaNError, LossySetitemError, @@ -1643,13 +1642,11 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n with warnings.catch_warnings(): # We already disallow dtype=uint w/ negative numbers # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. - if not np_version_gt2: - warnings.filterwarnings( - "ignore", - "NumPy will stop allowing conversion of " - "out-of-bound Python int", - DeprecationWarning, - ) + warnings.filterwarnings( + "ignore", + "NumPy will stop allowing conversion of " "out-of-bound Python int", + DeprecationWarning, + ) casted = np.asarray(arr, dtype=dtype) else: with warnings.catch_warnings(): diff --git a/pandas/tests/indexes/datetimelike_/test_indexing.py b/pandas/tests/indexes/datetimelike_/test_indexing.py index ee7128601256a..7b2c81aaf17de 100644 --- a/pandas/tests/indexes/datetimelike_/test_indexing.py +++ b/pandas/tests/indexes/datetimelike_/test_indexing.py @@ -19,7 +19,7 @@ @pytest.mark.parametrize("ldtype", dtlike_dtypes) @pytest.mark.parametrize("rdtype", dtlike_dtypes) def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): - vals = np.tile(3600 * 10**9 * np.arange(3), 2) + vals = np.tile(3600 * 10**9 * np.arange(3, dtype=np.int64), 2) def construct(dtype): if dtype is dtlike_dtypes[-1]: diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2e8e358b8e3c9..930df8abea30f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -14,6 +14,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, + pa_version_under17p0, ) import pandas as pd @@ -1033,7 +1034,9 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) - @pytest.mark.xfail(reason="pa.pandas_compat passes 'datetime64' to .astype") + @pytest.mark.xfail( + pa_version_under17p0, reason="pa.pandas_compat passes 'datetime64' to .astype" + ) def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index f29135cbf399e..8efd4b551ad49 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -419,7 +419,7 @@ def test_td_mul_numeric_ndarray(self): def test_td_mul_numeric_ndarray_0d(self): td = Timedelta("1 day") - other = np.array(2) + other = np.array(2, dtype=np.int64) assert other.ndim == 0 expected = Timedelta("2 
days") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4f5b7f884ce12..c1d6baaf17c92 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3187,7 +3187,7 @@ def test_invalid_origin(self, unit): ) def test_epoch(self, units, epochs): epoch_1960 = Timestamp(1960, 1, 1) - units_from_epochs = list(range(5)) + units_from_epochs = np.arange(5, dtype=np.int64) expected = Series( [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] ) From 9dc725a85523aa208b3f061792f4610903571fb6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Jun 2024 14:42:05 -1000 Subject: [PATCH 115/272] CI: Clean GHA files (#59085) * Bump postgres, mysql, moto * Clean up other GHA items --- .github/actions/build_pandas/action.yml | 11 ++--------- .github/actions/run-tests/action.yml | 4 ++-- .github/workflows/unit-tests.yml | 15 ++++++--------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 63f687324b0ae..460ae2f8594c0 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -4,12 +4,6 @@ inputs: editable: description: Whether to build pandas in editable mode (default true) default: true - meson_args: - description: Extra flags to pass to meson - required: false - cflags_adds: - description: Items to append to the CFLAGS variable - required: false runs: using: composite steps: @@ -30,12 +24,11 @@ runs: - name: Build Pandas run: | - export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}" if [[ ${{ inputs.editable }} == "true" ]]; then - pip install -e . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \ + pip install -e . --no-build-isolation -v --no-deps \ --config-settings=setup-args="--werror" else - pip install . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \ + pip install . 
--no-build-isolation -v --no-deps \ --config-settings=setup-args="--werror" fi shell: bash -el {0} diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index 66e4142dc0cbb..f5d6abdf0f186 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -7,14 +7,14 @@ runs: shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Test results path: test-data.xml if: failure() - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: flags: unittests name: codecov-pandas diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 600ffd56b6d56..89d4f1e96a5ee 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -74,9 +74,9 @@ jobs: PATTERN: ${{ matrix.pattern }} LANG: ${{ matrix.lang || 'C.UTF-8' }} LC_ALL: ${{ matrix.lc_all || '' }} - PANDAS_CI: ${{ matrix.pandas_ci || '1' }} + PANDAS_CI: '1' TEST_ARGS: ${{ matrix.test_args || '' }} - PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }} + PYTEST_WORKERS: 'auto' PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }} # Clipboard tests @@ -88,7 +88,7 @@ jobs: services: mysql: - image: mysql:8.0.33 + image: mysql:8 env: MYSQL_ALLOW_EMPTY_PASSWORD: yes MYSQL_DATABASE: pandas @@ -101,7 +101,7 @@ jobs: - 3306:3306 postgres: - image: postgres:13 + image: postgres:16 env: PGUSER: postgres POSTGRES_USER: postgres @@ -116,7 +116,7 @@ jobs: - 5432:5432 moto: - image: motoserver/moto:4.1.13 + image: motoserver/moto:5.0.0 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret @@ -148,9 +148,6 @@ jobs: uses: ./.github/actions/build_pandas # TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge if: ${{ matrix.name != 'Pypy' }} - with: - meson_args: ${{ matrix.meson_args }} - cflags_adds: ${{ matrix.cflags_adds }} - name: Test (not single_cpu) uses: ./.github/actions/run-tests @@ -317,7 +314,7 @@ jobs: concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-dev cancel-in-progress: true env: From 39a3bf355574df76df44cededffac96fddb48a04 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 25 Jun 2024 03:40:22 +0200 Subject: [PATCH 116/272] ENH: Fix Python 3.13 test failures & enable CI (#59065) * ENH: Fix Python 3.13 test failures & enable CI x-ref #58734 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> * Cast npy_intp to int to fix Windows CI --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 4 ++-- pandas/_libs/src/vendored/ujson/python/objToJSON.c | 12 ++++++------ pandas/_libs/tslibs/offsets.pyx | 7 ++++++- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/io/parser/test_dialect.py | 2 +- pandas/tests/io/test_common.py | 5 ++++- pandas/tests/io/xml/test_xml.py | 2 +- pandas/tests/scalar/timedelta/test_arithmetic.py | 1 + 8 files changed, 22 insertions(+), 13 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 89d4f1e96a5ee..32b7aee7c2bdc 100644 --- a/.github/workflows/unit-tests.yml +++ 
b/.github/workflows/unit-tests.yml @@ -299,7 +299,7 @@ jobs: # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs # to the corresponding posix/windows-macos/sdist etc. workflows. # Feel free to modify this comment as necessary. - if: false # Uncomment this to freeze the workflow, comment it to unfreeze + # if: false # Uncomment this to freeze the workflow, comment it to unfreeze defaults: run: shell: bash -eou pipefail {0} @@ -331,7 +331,7 @@ jobs: - name: Set up Python Dev Version uses: actions/setup-python@v5 with: - python-version: '3.12-dev' + python-version: '3.13-dev' - name: Build Environment run: | diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index fa91db5fe34e3..5f35860c59cb7 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -410,8 +410,8 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, (int)npyarr->ndim); npyarr->stridedim = npyarr->ndim; npyarr->index[npyarr->ndim] = 0; npyarr->inc = -1; @@ -452,8 +452,8 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { return; } const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -524,8 +524,8 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { } const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index a24941e4f0a5a..80889aeb58332 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4948,7 +4948,12 @@ cpdef to_offset(freq, bint is_period=False): if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - if is_period and not hasattr(result, "_period_dtype_code"): + try: + has_period_dtype_code = hasattr(result, "_period_dtype_code") + except ValueError: + has_period_dtype_code = False + + if is_period and not has_period_dtype_code: if isinstance(freq, str): raise ValueError(f"{result.name} is not supported as period frequency") else: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6ff70d26d8425..3506fb55e0d4c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2445,7 +2445,7 @@ def test_rolling_wrong_param_min_period(): test_df.columns = ["name", "val"] result_error_msg = ( - r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'$" + r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'" ) with 
pytest.raises(TypeError, match=result_error_msg): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 7a72e66996d43..803114723bc74 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -26,7 +26,7 @@ def custom_dialect(): "escapechar": "~", "delimiter": ":", "skipinitialspace": False, - "quotechar": "~", + "quotechar": "`", "quoting": 3, } return dialect_name, dialect_kwargs diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e4b4d3a82669d..d73790365bb1f 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -474,7 +474,10 @@ def test_warning_missing_utf_bom(self, encoding, compression_): df.to_csv(path, compression=compression_, encoding=encoding) # reading should fail (otherwise we wouldn't need the warning) - msg = r"UTF-\d+ stream does not start with BOM" + msg = ( + r"UTF-\d+ stream does not start with BOM|" + r"'utf-\d+' codec can't decode byte" + ) with pytest.raises(UnicodeError, match=msg): pd.read_csv(path, compression=compression_, encoding=encoding) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 357e6129dd8f1..4454607606395 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1038,7 +1038,7 @@ def test_utf16_encoding(xml_baby_names, parser): UnicodeError, match=( "UTF-16 stream does not start with BOM|" - "'utf-16-le' codec can't decode byte" + "'utf-16(-le)?' codec can't decode byte" ), ): read_xml(xml_baby_names, encoding="UTF-16", parser=parser) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 8efd4b551ad49..2183a5851ea9c 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -623,6 +623,7 @@ def test_td_floordiv_invalid_scalar(self): [ r"Invalid dtype datetime64\[D\] for __floordiv__", "'dtype' is an invalid keyword argument for this function", + "this function got an unexpected keyword argument 'dtype'", r"ufunc '?floor_divide'? cannot use operands with types", ] ) From 46916db5a94001ab2c8f022026ea2bfa48951e44 Mon Sep 17 00:00:00 2001 From: Alex Malins Date: Wed, 26 Jun 2024 02:06:00 +0900 Subject: [PATCH 117/272] DOC: document default behaviour of `sample` methods when `random_state = None` (#59073) * Document the default None random_state of sample Previously the default random state of the Series/DataFrame/GroupBy `sample` methods was unclear as the documentation for the behaviour when `random_state=None` was hidden in the `pandas.core.common.random_state` helper function. This commit documents how `sample` will just defer to the random state of `numpy.random` when no `random_state` parameter is passed. * Add Series.sample to GroupBy See Also docstring * Format None type as code in doc strings --- pandas/core/generic.py | 3 ++- pandas/core/groupby/groupby.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b4908ad7a2158..33190f905be13 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5731,7 +5731,7 @@ def sample( replace : bool, default False Allow or disallow sampling of the same row more than once. weights : str or ndarray-like, optional - Default 'None' results in equal probability weighting. + Default ``None`` results in equal probability weighting.
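[Editor's sketch] The behaviour this docstring change documents is easy to demonstrate. A minimal sketch, assuming only numpy and pandas; the DataFrame contents are hypothetical:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": range(10)})

    # random_state=None (the default) defers to numpy's global RNG state,
    # so seeding np.random makes the draw reproducible:
    np.random.seed(0)
    first = df.sample(n=3)
    np.random.seed(0)
    assert df.sample(n=3).equals(first)

    # an explicit seed is reproducible without touching the global state:
    assert df.sample(n=3, random_state=42).equals(df.sample(n=3, random_state=42))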
If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and index values in sampled object not in weights will be assigned @@ -5746,6 +5746,7 @@ def sample( random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. + Default ``None`` results in sampling with the current state of np.random. .. versionchanged:: 1.4.0 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 763fd4e59a978..bf71bb80b3623 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5389,6 +5389,7 @@ def sample( random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. + Default ``None`` results in sampling with the current state of np.random. .. versionchanged:: 1.4.0 @@ -5403,6 +5404,7 @@ def sample( See Also -------- DataFrame.sample: Generate random samples from a DataFrame object. + Series.sample: Generate random samples from a Series object. numpy.random.choice: Generate a random sample from a given 1-D numpy array. From 288f47a848b7232001e0321ad2304b9f71e81d49 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 25 Jun 2024 22:48:51 +0530 Subject: [PATCH 118/272] DOC: fix SA01, ES01 for pandas.Timestamp.tz (#59097) * DOC: fix SA01, ES01 for pandas.Timestamp.tz * DOC: fix SA01, ES01 for pandas.Timestamp.tz --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/timestamps.pyx | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 424171cee794c..41564e6426418 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -260,7 +260,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.to_period PR01,SA01" \ -i "pandas.Timestamp.today SA01" \ -i "pandas.Timestamp.toordinal SA01" \ - -i "pandas.Timestamp.tz SA01" \ -i "pandas.Timestamp.tz_localize SA01" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.tzname SA01" \ diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 93715c907d182..628527bd4ff9b 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -2368,6 +2368,17 @@ timedelta}, default 'raise' """ Alias for tzinfo. + The `tz` property provides a simple and direct way to retrieve the timezone + information of a `Timestamp` object. It is particularly useful when working + with time series data that includes timezone information, allowing for easy + access and manipulation of the timezone context. + + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. 
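[Editor's sketch] A brief sketch of the alias behaviour described in the See Also entries above (standard pandas behaviour, nothing assumed beyond it):

    import pandas as pd

    ts = pd.Timestamp("2024-01-01 12:00", tz="Europe/Stockholm")
    assert ts.tz is ts.tzinfo  # .tz is simply an alias for .tzinfo
    assert ts.tz_localize(None).tz is None  # naive timestamps carry no tz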
+ Examples -------- >>> ts = pd.Timestamp(1584226800, unit='s', tz='Europe/Stockholm') From 5719571ef136decf94e4ae7544630d1c53c67727 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 25 Jun 2024 19:20:12 +0200 Subject: [PATCH 119/272] BUG: Fix sparse doctests for SciPy 1.14.0 (#59094) --- pandas/core/arrays/sparse/accessor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 6a1c25711acb0..39a70fba9aa78 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -98,8 +98,8 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: ... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) ... ) >>> A - <3x4 sparse matrix of type '<class 'numpy.float64'>' - with 3 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'float64' + with 3 stored elements and shape (3, 4)> >>> A.todense() matrix([[0., 0., 1., 2.], @@ -186,8 +186,8 @@ def to_coo( ... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True ... ) >>> A - <3x4 sparse matrix of type '<class 'numpy.float64'>' - with 3 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'float64' + with 3 stored elements and shape (3, 4)> >>> A.todense() matrix([[0., 0., 1., 3.], [3., 0., 0., 0.], @@ -380,8 +380,8 @@ def to_coo(self) -> spmatrix: -------- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) >>> df.sparse.to_coo() - <4x1 sparse matrix of type '<class 'numpy.int64'>' - with 2 stored elements in COOrdinate format> + <COOrdinate sparse matrix of dtype 'int64' + with 2 stored elements and shape (4, 1)> """ import_optional_dependency("scipy") from scipy.sparse import coo_matrix From a3babacc329a560ada63254545bf2c990f1d976a Mon Sep 17 00:00:00 2001 From: Ulrich Dobramysl <1979498+ulido@users.noreply.github.com> Date: Tue, 25 Jun 2024 18:22:07 +0100 Subject: [PATCH 120/272] ENH: Add option to only merge column header cells in `ExcelFormatter`. (#59081) * Add option to only merge column header cells in `ExcelFormatter`. * Add entry in the whatsnew document * Remove erroneous `:ref:` from docstring Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Correct typo in docstring Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Remove superfluous parentheses from if statement; better error message Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Fix missing double quote.
* Wording of whatsnew entry

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 doc/source/whatsnew/v3.0.0.rst        |  1 +
 pandas/_typing.py                     |  1 +
 pandas/io/formats/excel.py            | 23 ++++++++++++++++-------
 pandas/io/formats/style.py            |  3 ++-
 pandas/tests/io/excel/test_writers.py |  2 +-
 5 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index f7039021ff276..ec7b8b375abe5 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -39,6 +39,7 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
diff --git a/pandas/_typing.py b/pandas/_typing.py
index d90596878ba51..09a3f58d6ab7f 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -510,6 +510,7 @@ def closed(self) -> bool:

 # ExcelWriter
 ExcelWriterIfSheetExists = Literal["error", "new", "replace", "overlay"]
+ExcelWriterMergeCells = Union[bool, Literal["columns"]]

 # Offsets
 OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"]
diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
index a98d9c175c2bd..52b5755558900 100644
--- a/pandas/io/formats/excel.py
+++ b/pandas/io/formats/excel.py
@@ -52,6 +52,7 @@

 if TYPE_CHECKING:
     from pandas._typing import (
+        ExcelWriterMergeCells,
         FilePath,
         IndexLabel,
         StorageOptions,
@@ -523,8 +524,11 @@ class ExcelFormatter:
         Column label for index column(s) if desired. If None is given, and
         `header` and `index` are True, then the index names are used. A
         sequence should be given if the DataFrame uses MultiIndex.
-    merge_cells : bool, default False
-        Format MultiIndex and Hierarchical Rows as merged cells.
+    merge_cells : bool or 'columns', default False
+        Format MultiIndex column headers and Hierarchical Rows as merged cells
+        if True. Merge MultiIndex column headers only if 'columns'.
+
+        .. versionchanged:: 3.0.0
+            Added the 'columns' option.
     inf_rep : str, default `'inf'`
        representation for np.inf values (which aren't representable in Excel)
        A `'-'` sign will be added in front of -inf.
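To illustrate the ``merge_cells`` values documented above, a minimal usage sketch (the frame and output path are hypothetical, and an Excel engine such as openpyxl is assumed to be installed):

import pandas as pd

# A two-level column MultiIndex; "merged.xlsx" is a placeholder path.
cols = pd.MultiIndex.from_product([["2024"], ["Q1", "Q2"]])
df = pd.DataFrame([[1, 2], [3, 4]], columns=cols)

# True merges both MultiIndex column headers and hierarchical index rows;
# "columns" merges only the column header cells; False merges nothing.
df.to_excel("merged.xlsx", merge_cells="columns")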
@@ -547,7 +551,7 @@ def __init__( header: Sequence[Hashable] | bool = True, index: bool = True, index_label: IndexLabel | None = None, - merge_cells: bool = False, + merge_cells: ExcelWriterMergeCells = False, inf_rep: str = "inf", style_converter: Callable | None = None, ) -> None: @@ -580,6 +584,9 @@ def __init__( self.index = index self.index_label = index_label self.header = header + + if not isinstance(merge_cells, bool) and merge_cells != "columns": + raise ValueError(f"Unexpected value for {merge_cells=}.") self.merge_cells = merge_cells self.inf_rep = inf_rep @@ -614,7 +621,7 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: columns = self.columns level_strs = columns._format_multi( - sparsify=self.merge_cells, include_names=False + sparsify=self.merge_cells in {True, "columns"}, include_names=False ) level_lengths = get_level_lengths(level_strs) coloffset = 0 @@ -623,7 +630,7 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: if self.index and isinstance(self.df.index, MultiIndex): coloffset = self.df.index.nlevels - 1 - if self.merge_cells: + if self.merge_cells in {True, "columns"}: # Format multi-index as a merged cells. for lnum, name in enumerate(columns.names): yield ExcelCell( @@ -793,7 +800,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # with index names (blank if None) for # unambiguous round-trip, unless not merging, # in which case the names all go on one row Issue #11328 - if isinstance(self.columns, MultiIndex) and self.merge_cells: + if isinstance(self.columns, MultiIndex) and ( + self.merge_cells in {True, "columns"} + ): self.rowcounter += 1 # if index labels are not empty go ahead and dump @@ -801,7 +810,7 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: for cidx, name in enumerate(index_labels): yield ExcelCell(self.rowcounter - 1, cidx, name, None) - if self.merge_cells: + if self.merge_cells and self.merge_cells != "columns": # Format hierarchical rows as merged cells. 
level_strs = self.df.index._format_multi( sparsify=True, include_names=False diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index a695c539977b3..6f4c2fa6c6eae 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -66,6 +66,7 @@ Axis, AxisInt, Concatenate, + ExcelWriterMergeCells, FilePath, IndexLabel, IntervalClosedType, @@ -551,7 +552,7 @@ def to_excel( startrow: int = 0, startcol: int = 0, engine: str | None = None, - merge_cells: bool = True, + merge_cells: ExcelWriterMergeCells = True, encoding: str | None = None, inf_rep: str = "inf", verbose: bool = True, diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index ad1f22224bc0d..482b331332462 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -49,7 +49,7 @@ def frame(float_frame): return float_frame[:10] -@pytest.fixture(params=[True, False]) +@pytest.fixture(params=[True, False, "columns"]) def merge_cells(request): return request.param From 7ca6cd07859678f465c7e06a98575947422ab9fc Mon Sep 17 00:00:00 2001 From: Tim Yang Date: Tue, 25 Jun 2024 13:22:41 -0400 Subject: [PATCH 121/272] DOC: Fix typo in `StylerRenderer.format` docstring (#59090) Fix typo in `StylerRenderer.format` docstring --- pandas/io/formats/style_render.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 19a3563f43b4e..ec718f2a1276f 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -1052,7 +1052,7 @@ def format( When using a ``formatter`` string the dtypes must be compatible, otherwise a `ValueError` will be raised. - When instantiating a Styler, default formatting can be applied be setting the + When instantiating a Styler, default formatting can be applied by setting the ``pandas.options``: - ``styler.format.formatter``: default None. From d465645c9d73b249fa1008651608bd80f1f7c83e Mon Sep 17 00:00:00 2001 From: Mark Akritas <40570077+MarkAkritov@users.noreply.github.com> Date: Wed, 26 Jun 2024 00:02:50 +0400 Subject: [PATCH 122/272] RT03 fix in pandas.Series.pop docstring. (#59103) --- ci/code_checks.sh | 2 +- pandas/core/series.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 41564e6426418..0fa61a2465a2e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -162,7 +162,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.ne SA01" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Series.plot PR02" \ - -i "pandas.Series.pop RT03,SA01" \ + -i "pandas.Series.pop SA01" \ -i "pandas.Series.prod RT03" \ -i "pandas.Series.product RT03" \ -i "pandas.Series.reorder_levels RT03,SA01" \ diff --git a/pandas/core/series.py b/pandas/core/series.py index ee34240366aba..0a67ad7e57f86 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5020,7 +5020,8 @@ def pop(self, item: Hashable) -> Any: Returns ------- - Value that is popped from series. + scalar + Value that is popped from series. 
Examples -------- From 2db934e3b183443ae297cb00effac3bcbc1a406c Mon Sep 17 00:00:00 2001 From: Alexey Murz Korepov Date: Wed, 26 Jun 2024 00:04:58 +0400 Subject: [PATCH 123/272] Added a test for the groupby MultiIndex with codes (#59102) --- pandas/tests/groupby/test_groupby.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3506fb55e0d4c..13fb9cfc4c0e4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2985,3 +2985,14 @@ def test_groupby_agg_namedagg_with_duplicate_columns(): ) tm.assert_frame_equal(result, expected) + + +def test_groupby_multi_index_codes(): + # GH#54347 + df = DataFrame( + {"A": [1, 2, 3, 4], "B": [1, float("nan"), 2, float("nan")], "C": [2, 4, 6, 8]} + ) + df_grouped = df.groupby(["A", "B"], dropna=False).sum() + + index = df_grouped.index + tm.assert_index_equal(index, MultiIndex.from_frame(index.to_frame())) From a3d5e003891d08cf1e72108f94ccf47a603d5ebe Mon Sep 17 00:00:00 2001 From: Movsisyan Date: Tue, 25 Jun 2024 13:12:19 -0700 Subject: [PATCH 124/272] TST: .loc dtype change bug (#59101) * Naive test case for #29707 * Compare frames when testing loc dtype change for #29707 * Update pandas/tests/dtypes/test_dtypes.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/dtypes/test_dtypes.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 252fc484a8246..903c13587151a 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1231,3 +1231,15 @@ def test_multi_column_dtype_assignment(): df["b"] = 0 tm.assert_frame_equal(df, expected) + + +def test_loc_setitem_empty_labels_no_dtype_conversion(): + # GH 29707 + + df = pd.DataFrame({"a": [2, 3]}) + expected = df.copy() + assert df.a.dtype == "int64" + df.loc[[]] = 0.1 + + assert df.a.dtype == "int64" + tm.assert_frame_equal(df, expected) From d604aa81736b1683dc166d05bfa8e34413b28960 Mon Sep 17 00:00:00 2001 From: chaoyihu <90495101+chaoyihu@users.noreply.github.com> Date: Tue, 25 Jun 2024 13:14:41 -0700 Subject: [PATCH 125/272] Fix insertion of None value in MultiIndex (#59069) * Fix insertion of None value in MultiIndex * add whatnew entry * check for NA values using missing::isna * adding test insertion of np.nan * update whatsnew entry * test case revision --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/multi.py | 7 +++++-- pandas/tests/test_multilevel.py | 7 +++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ec7b8b375abe5..e06df92a220dd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -545,6 +545,7 @@ MultiIndex ^^^^^^^^^^ - :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`) - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`) +- :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`) - I/O diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9d7c7f3e4a5c9..8e3ebc7816fed 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ 
-3906,8 +3906,11 @@ def insert(self, loc: int, item) -> MultiIndex: # have to insert into level # must insert at end otherwise you have to recompute all the # other codes - lev_loc = len(level) - level = level.insert(lev_loc, k) + if isna(k): # GH 59003 + lev_loc = -1 + else: + lev_loc = len(level) + level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8f661edf0f241..e87498742061b 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -288,6 +288,13 @@ def test_multiindex_with_na(self): tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize("na", [None, np.nan]) + def test_multiindex_insert_level_with_na(self, na): + # GH 59003 + df = DataFrame([0], columns=[["A"], ["B"]]) + df[na, "B"] = 1 + tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"])) + class TestSorted: """everything you wanted to test about sorting""" From f44ce136678a029772303cfcc566b95deebc86f5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:16:13 -1000 Subject: [PATCH 126/272] PERF: Use shallow copies/remove unnecessary copies in reshaping (#58959) --- pandas/core/reshape/melt.py | 4 ++-- pandas/core/reshape/pivot.py | 12 +++++------- pandas/core/reshape/reshape.py | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 294de2cf2fe1d..8756d0ad003d7 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -202,9 +202,9 @@ def melt( if value_vars_was_not_none: frame = frame.iloc[:, algos.unique(idx)] else: - frame = frame.copy() + frame = frame.copy(deep=False) else: - frame = frame.copy() + frame = frame.copy(deep=False) if col_level is not None: # allow list or other? # frame is a copy diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 2dc5c7af00958..0d9e9f520cbea 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -557,8 +557,6 @@ def _all_key(key): piece = piece.T all_key = _all_key(key) - # we are going to mutate this, so need to copy! 
- piece = piece.copy() piece[all_key] = margin[key] table_pieces.append(piece) @@ -842,11 +840,11 @@ def pivot( # If columns is None we will create a MultiIndex level with None as name # which might cause duplicated names because None is the default for # level names - data = data.copy(deep=False) - data.index = data.index.copy() - data.index.names = [ - name if name is not None else lib.no_default for name in data.index.names - ] + if any(name is None for name in data.index.names): + data = data.copy(deep=False) + data.index.names = [ + name if name is not None else lib.no_default for name in data.index.names + ] indexed: DataFrame | Series if values is lib.no_default: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 664ac57fcc823..3df55256ec43b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -461,7 +461,7 @@ def _unstack_multiple( ) if isinstance(data, Series): - dummy = data.copy() + dummy = data.copy(deep=False) dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort) @@ -1025,7 +1025,7 @@ def stack_reshape( buf = [] for idx in stack_cols.unique(): if len(frame.columns) == 1: - data = frame.copy() + data = frame.copy(deep=False) else: if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple): # GH#57750 - if the frame is an Index with tuples, .loc below will fail From 1a95c790b3b37667acb9a029db5db967944f9c12 Mon Sep 17 00:00:00 2001 From: Richard Howe <45905457+rmhowe425@users.noreply.github.com> Date: Tue, 25 Jun 2024 16:25:00 -0400 Subject: [PATCH 127/272] BUG: True cannot be cast to bool in read_excel (#58994) * Adding implementation, tests. Updating documentation * Fixing dependency error by moving test inside of TestReaders * Updating implementation based on reviewer feedback * Creating a more clean implementation * Fixing broken unit tests * Fixing docstring error * Updating implementation based on reviewer feedback. Adding additional unit tests * Updating implementation based on reviewer feedback * Updating implementation based on reviewer feedback * Using datapath fixture * Fixing failing unit test * Removing unneeded file * Fixing failing documentation test * Updating unit test based on reviewer feedback --- doc/source/whatsnew/v3.0.0.rst | 2 ++ pandas/core/arrays/boolean.py | 8 ++++- pandas/io/parsers/base_parser.py | 4 ++- .../io/data/excel/test_boolean_types.xlsx | Bin 0 -> 5279 bytes .../tests/io/data/excel/test_none_type.xlsx | Bin 0 -> 5302 bytes pandas/tests/io/excel/test_readers.py | 30 ++++++++++++++++++ 6 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/io/data/excel/test_boolean_types.xlsx create mode 100644 pandas/tests/io/data/excel/test_none_type.xlsx diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e06df92a220dd..d7edbd1c20e91 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -558,7 +558,9 @@ I/O - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) +- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. 
(:issue:`58159`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
+-

 Period
 ^^^^^^
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index a326925545045..74c0cd7719c13 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -329,15 +329,21 @@ def _from_sequence_of_strings(
         copy: bool = False,
         true_values: list[str] | None = None,
         false_values: list[str] | None = None,
+        none_values: list[str] | None = None,
     ) -> BooleanArray:
         true_values_union = cls._TRUE_VALUES.union(true_values or [])
         false_values_union = cls._FALSE_VALUES.union(false_values or [])

-        def map_string(s) -> bool:
+        if none_values is None:
+            none_values = []
+
+        def map_string(s) -> bool | None:
             if s in true_values_union:
                 return True
             elif s in false_values_union:
                 return False
+            elif s in none_values:
+                return None
             else:
                 raise ValueError(f"{s} cannot be cast to bool")

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 7e91d9e262748..e7473aabdff87 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -745,11 +745,13 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi
         if isinstance(cast_type, BooleanDtype):
             # error: Unexpected keyword argument "true_values" for
             # "_from_sequence_of_strings" of "ExtensionArray"
+            values_str = [str(val) for val in values]
             return array_type._from_sequence_of_strings(  # type: ignore[call-arg]
-                values,
+                values_str,
                 dtype=cast_type,
                 true_values=self.true_values,
                 false_values=self.false_values,
+                none_values=self.na_values,
             )
         else:
             return array_type._from_sequence_of_strings(values, dtype=cast_type)
diff --git a/pandas/tests/io/data/excel/test_boolean_types.xlsx b/pandas/tests/io/data/excel/test_boolean_types.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..234703c32f0abe61516c3e44aa35275242d14f08
GIT binary patch
literal 5279
[base85-encoded xlsx payload omitted]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/excel/test_none_type.xlsx b/pandas/tests/io/data/excel/test_none_type.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..38aaf72ddfc8ffd80094f8b57618e1d519fd51d1
GIT binary patch
literal 5302
[base85-encoded xlsx payload omitted]

literal 0
HcmV?d00001

Date: Tue, 25 Jun 2024 23:44:41 +0200
Subject: [PATCH 128/272] CLN:
enforce deprecation of the Series[categorical].replace special-casing (#58270) * enforce depr behavior df.replace / s.replace with CategoricalDtype * fixup tests in frame/methods/test_replace.py * fixup tests in arrays/categorical/test_replace.py and pandas/tests/copy_view/test_replace.py * add a note to v3.0.0 * remove _replace and special-casing, fix tests * fix tests * Adjust tests and clarify whatsnew * Fix pre-commit --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/categorical.py | 58 --------- pandas/core/internals/blocks.py | 17 --- .../tests/arrays/categorical/test_replace.py | 118 ++++++------------ pandas/tests/copy_view/test_replace.py | 56 ++------- pandas/tests/frame/methods/test_replace.py | 81 ++++++------ pandas/tests/series/methods/test_replace.py | 34 ++--- 7 files changed, 96 insertions(+), 269 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d7edbd1c20e91..6da38a364ab5b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -381,6 +381,7 @@ Other Removals - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`) - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`) +- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories. (:issue:`58270`) - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`) - Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Enforced silent-downcasting deprecation for :ref:`all relevant methods ` (:issue:`54710`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c656e4bf1e20c..18b52f741370f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -10,7 +10,6 @@ cast, overload, ) -import warnings import numpy as np @@ -23,7 +22,6 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas.compat.numpy import function as nv -from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -2673,62 +2671,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) - @overload - def _replace(self, *, to_replace, value, inplace: Literal[False] = ...) -> Self: ... - - @overload - def _replace(self, *, to_replace, value, inplace: Literal[True]) -> None: ... 
- - def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None: - from pandas import Index - - orig_dtype = self.dtype - - inplace = validate_bool_kwarg(inplace, "inplace") - cat = self if inplace else self.copy() - - mask = isna(np.asarray(value)) - if mask.any(): - removals = np.asarray(to_replace)[mask] - removals = cat.categories[cat.categories.isin(removals)] - new_cat = cat.remove_categories(removals) - NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype) - - ser = cat.categories.to_series() - ser = ser.replace(to_replace=to_replace, value=value) - - all_values = Index(ser) - - # GH51016: maintain order of existing categories - idxr = cat.categories.get_indexer_for(all_values) - locs = np.arange(len(ser)) - locs = np.where(idxr == -1, locs, idxr) - locs = locs.argsort() - - new_categories = ser.take(locs) - new_categories = new_categories.drop_duplicates(keep="first") - index_categories = Index(new_categories) - new_codes = recode_for_categories( - cat._codes, all_values, index_categories, copy=False - ) - new_dtype = CategoricalDtype(index_categories, ordered=self.dtype.ordered) - NDArrayBacked.__init__(cat, new_codes, new_dtype) - - if new_dtype != orig_dtype: - warnings.warn( - # GH#55147 - "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is deprecated. In a future version, replace " - "will only be used for cases that preserve the categories. " - "To change the categories, use ser.cat.rename_categories " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if not inplace: - return cat - return None - # ------------------------------------------------------------------------ # String methods interface def _str_map( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3614d43425a09..6bb335bca12b3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -100,7 +100,6 @@ ) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, IntervalArray, @@ -696,14 +695,6 @@ def replace( # go through replace_list values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=to_replace, value=value, inplace=True) - return [blk] - if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. 
@@ -803,14 +794,6 @@ def replace_list( """ values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=src_list, value=dest_list, inplace=True) - return [blk] - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 3c677142846d7..7f3e8d3ed6e6e 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -6,106 +6,66 @@ @pytest.mark.parametrize( - "to_replace,value,expected,flip_categories", + "to_replace,value,expected", [ # one-to-one - (1, 2, [2, 2, 3], False), - (1, 4, [4, 2, 3], False), - (4, 1, [1, 2, 3], False), - (5, 6, [1, 2, 3], False), + (4, 1, [1, 2, 3]), + (3, 1, [1, 2, 1]), # many-to-one - ([1], 2, [2, 2, 3], False), - ([1, 2], 3, [3, 3, 3], False), - ([1, 2], 4, [4, 4, 3], False), - ((1, 2, 4), 5, [5, 5, 3], False), - ((5, 6), 2, [1, 2, 3], False), - ([1], [2], [2, 2, 3], False), - ([1, 4], [5, 2], [5, 2, 3], False), - # GH49404: overlap between to_replace and value - ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), - # GH50872, GH46884: replace with null - (1, None, [None, 2, 3], False), - (1, pd.NA, [None, 2, 3], False), - # check_categorical sorts categories, which crashes on mixed dtypes - (3, "4", [1, 2, "4"], False), - ([1, 2, "3"], "5", ["5", "5", 3], True), + ((5, 6), 2, [1, 2, 3]), + ((3, 2), 1, [1, 1, 1]), ], ) -@pytest.mark.filterwarnings( - "ignore:.*with CategoricalDtype is deprecated:FutureWarning" -) -def test_replace_categorical_series(to_replace, value, expected, flip_categories): +def test_replace_categorical_series(to_replace, value, expected): # GH 31720 - ser = pd.Series([1, 2, 3], dtype="category") result = ser.replace(to_replace, value) - expected = pd.Series(expected, dtype="category") - ser.replace(to_replace, value, inplace=True) - - if flip_categories: - expected = expected.cat.set_categories(expected.cat.categories[::-1]) - - tm.assert_series_equal(expected, result, check_category_order=False) - tm.assert_series_equal(expected, ser, check_category_order=False) + expected = pd.Series(Categorical(expected, categories=[1, 2, 3])) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "to_replace, value, result, expected_error_msg", + "to_replace,value", [ - ("b", "c", ["a", "c"], "Categorical.categories are different"), - ("c", "d", ["a", "b"], None), - # https://github.com/pandas-dev/pandas/issues/33288 - ("a", "a", ["a", "b"], None), - ("b", None, ["a", None], "Categorical.categories length are different"), + # one-to-one + (3, 5), + # many-to-one + ((3, 2), 5), ], ) -def test_replace_categorical(to_replace, value, result, expected_error_msg): - # GH#26988 - cat = Categorical(["a", "b"]) - expected = Categorical(result) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if expected_error_msg is not None else None - with tm.assert_produces_warning(warn, match=msg): - result = pd.Series(cat, copy=False).replace(to_replace, value)._values +def test_replace_categorical_series_new_category_raises(to_replace, value): + # GH 31720 + ser = pd.Series([1, 2, 3], dtype="category") + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + 
ser.replace(to_replace, value) - tm.assert_categorical_equal(result, expected) - if to_replace == "b": # the "c" test is supposed to be unchanged - with pytest.raises(AssertionError, match=expected_error_msg): - # ensure non-inplace call does not affect original - tm.assert_categorical_equal(cat, expected) - ser = pd.Series(cat, copy=False) - with tm.assert_produces_warning(warn, match=msg): - ser.replace(to_replace, value, inplace=True) - tm.assert_categorical_equal(cat, expected) +def test_replace_maintain_ordering(): + # GH51016 + dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) + ser = pd.Series([0, 1, 2], dtype=dtype) + result = ser.replace(0, 2) + expected = pd.Series([2, 1, 2], dtype=dtype) + tm.assert_series_equal(expected, result, check_category_order=True) def test_replace_categorical_ea_dtype(): # GH49404 - cat = Categorical(pd.array(["a", "b"], dtype="string")) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + cat = Categorical(pd.array(["a", "b", "c"], dtype="string")) + result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values + expected = Categorical( + pd.array(["c"] * 3, dtype="string"), + categories=pd.array(["a", "b", "c"], dtype="string"), ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values - expected = Categorical(pd.array(["c", pd.NA], dtype="string")) tm.assert_categorical_equal(result, expected) -def test_replace_maintain_ordering(): - # GH51016 - dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) - ser = pd.Series([0, 1, 2], dtype=dtype) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace(0, 2) - expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) - expected = pd.Series([2, 1, 2], dtype=expected_dtype) - tm.assert_series_equal(expected, result, check_category_order=True) +def test_replace_categorical_ea_dtype_different_cats_raises(): + # GH49404 + cat = Categorical(pd.array(["a", "b"], dtype="string")) + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + pd.Series(cat).replace(["a", "b"], ["c", pd.NA]) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 63254f1244a2e..2eb88923c0087 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -129,18 +129,14 @@ def test_replace_to_replace_wrong_dtype(): def test_replace_list_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) + + df.replace(["c"], value="a", inplace=True) assert np.shares_memory(arr.codes, get_array(df, "a").codes) assert df._mgr._has_no_reference(0) df_orig = df.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): - df2 = df.replace(["b"], value="a") + df.replace(["b"], value="a") + df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"})) assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) @@ -150,13 +146,7 @@ def test_replace_list_inplace_refs_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = df.copy() - msg = ( - r"The 
behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) - assert not np.shares_memory(get_array(view, "a").codes, get_array(df, "a").codes) + df.replace(["c"], value="a", inplace=True) tm.assert_frame_equal(df_orig, view) @@ -195,56 +185,34 @@ def test_replace_inplace_reference_no_op(to_replace): @pytest.mark.parametrize("to_replace", [1, [1]]) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace_reference(val, to_replace): +def test_replace_categorical_inplace_reference(to_replace): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df.replace(to_replace=to_replace, value=val, inplace=True) - + df.replace(to_replace=to_replace, value=1, inplace=True) assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) tm.assert_frame_equal(view, df_orig) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace(val): +def test_replace_categorical_inplace(): df = DataFrame({"a": Categorical([1, 2, 3])}) arr_a = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df.replace(to_replace=1, value=val, inplace=True) + df.replace(to_replace=1, value=1, inplace=True) assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) assert df._mgr._has_no_reference(0) - expected = DataFrame({"a": Categorical([val, 2, 3])}) + expected = DataFrame({"a": Categorical([1, 2, 3])}) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical(val): +def test_replace_categorical(): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df2 = df.replace(to_replace=1, value=val) + df2 = df.replace(to_replace=1, value=1) assert df._mgr._has_no_reference(0) assert df2._mgr._has_no_reference(0) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index fb7ba2b7af38a..3fcc4aaa6960f 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1171,38 +1171,6 @@ def test_replace_with_empty_dictlike(self, mix_abc): tm.assert_frame_equal(df, df.replace({"b": {}})) tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) - @pytest.mark.parametrize( - "replace_dict, final_data", - [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], - ) - def test_categorical_replace_with_dict(self, replace_dict, final_data): - # GH 26988 - df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") - - final_data = np.array(final_data) - - a = pd.Categorical(final_data[:, 0], categories=[3, 2]) - - ex_cat = [3, 2] if replace_dict["b"] == 1 else [1, 3] - b = pd.Categorical(final_data[:, 1], categories=ex_cat) - - expected = DataFrame({"a": a, "b": b}) - msg2 = "with 
CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = df.replace(replace_dict, 3) - tm.assert_frame_equal(result, expected) - msg = ( - r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " - "different" - ) - with pytest.raises(AssertionError, match=msg): - # ensure non-inplace call does not affect original - tm.assert_frame_equal(df, expected) - with tm.assert_produces_warning(FutureWarning, match=msg2): - return_value = df.replace(replace_dict, 3, inplace=True) - assert return_value is None - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "df, to_replace, exp", [ @@ -1300,6 +1268,30 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[2, 2], [2, 2]]), ({"a": 1, "b": 2}, [[2, 1], [2, 2]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[1, 2]) + b = pd.Categorical(final_data[:, 1], categories=[1, 2]) + + expected = DataFrame({"a": a, "b": b}) + result = df.replace(replace_dict, 2) + tm.assert_frame_equal(result, expected) + msg = r"DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " "different" + with pytest.raises(AssertionError, match=msg): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + return_value = df.replace(replace_dict, 2, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1345,15 +1337,17 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"d": "z"}) + ) + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"obj1": "obj9"}) + ) + result = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1378,12 +1372,11 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + result = input_df.apply( + lambda x: x.cat.rename_categories( + {"a": "z", "obj1": "obj9", "cat1": "catX"} + ) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0a79bcea679a7..90654df155cf0 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -370,9 +370,7 @@ def test_replace_mixed_types_with_string(self): def 
test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(pd.Categorical(categorical, categories=["A", "B"])) - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace({"A": 1, "B": 2}) + result = ser.cat.rename_categories({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. categories should be [1, 2] even if there are no "B"s present @@ -380,16 +378,13 @@ def test_replace_categorical(self, categorical, numeric): expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result, check_categorical=False) - @pytest.mark.parametrize( - "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])] - ) - def test_replace_categorical_inplace(self, data, data_exp): + def test_replace_categorical_inplace(self): # GH 53358 + data = ["a", "b", "c"] + data_exp = ["b", "b", "c"] result = pd.Series(data, dtype="category") - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result.replace(to_replace="a", value="b", inplace=True) - expected = pd.Series(data_exp, dtype="category") + result.replace(to_replace="a", value="b", inplace=True) + expected = pd.Series(pd.Categorical(data_exp, categories=data)) tm.assert_series_equal(result, expected) def test_replace_categorical_single(self): @@ -404,25 +399,10 @@ def test_replace_categorical_single(self): expected = expected.cat.remove_unused_categories() assert c[2] != "foo" - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = c.replace(c[2], "foo") + result = c.cat.rename_categories({c.values[2]: "foo"}) tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[2], "foo", inplace=True) - assert return_value is None - tm.assert_series_equal(expected, c) - - first_value = c[0] - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[1], c[0], inplace=True) - assert return_value is None - assert c[0] == c[1] == first_value # test replacing with existing value - def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError From 2d6e61ead8ecf4666764e87e7aebe4bddaee5ef9 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Tue, 25 Jun 2024 23:46:55 +0200 Subject: [PATCH 129/272] DEPR: replace for Timedelta deprecated unit 'd' with 'D' in benchmarks (#59083) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit replace deprecated unit d with “D” --- asv_bench/benchmarks/tslibs/timedelta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py index dcc73aefc6c7a..9d9689fcfa94b 100644 --- a/asv_bench/benchmarks/tslibs/timedelta.py +++ b/asv_bench/benchmarks/tslibs/timedelta.py @@ -20,7 +20,7 @@ def time_from_int(self): Timedelta(123456789) def time_from_unit(self): - Timedelta(1, unit="d") + Timedelta(1, unit="D") def time_from_components(self): Timedelta( From bef88efe999809b775ed88a02f0fc2fd6d2d08a2 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva 
<91160475+natmokval@users.noreply.github.com> Date: Wed, 26 Jun 2024 02:07:31 +0200 Subject: [PATCH 130/272] CLN: enforce the deprecation of the `Series.argsort` NA behavior (#58232) * enforce deprecation of the Series.argsort NA behavior * remove comments * add a note to v3.0.0 * correct def argsort and tests * correct def argsort/tests * fix pre-commit error * Restore numpy test --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/series.py | 21 +------------------- pandas/tests/extension/base/methods.py | 6 ++---- pandas/tests/series/methods/test_argsort.py | 22 +++++++-------------- 4 files changed, 11 insertions(+), 39 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6da38a364ab5b..130ccded72859 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -382,6 +382,7 @@ Other Removals - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`) - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`) - Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories. (:issue:`58270`) +- Enforced deprecation of the behavior of :meth:`Series.argsort` in the presence of NA values (:issue:`58232`) - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`) - Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Enforced silent-downcasting deprecation for :ref:`all relevant methods ` (:issue:`54710`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0a67ad7e57f86..a22cc59b62499 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -49,7 +49,6 @@ deprecate_nonkeyword_arguments, doc, ) -from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, @@ -3722,25 +3721,7 @@ def argsort( # GH#54257 We allow -1 here so that np.argsort(series) works self._get_axis_number(axis) - values = self._values - mask = isna(values) - - if mask.any(): - # TODO(3.0): once this deprecation is enforced we can call - # self.array.argsort directly, which will close GH#43840 and - # GH#12694 - warnings.warn( - "The behavior of Series.argsort in the presence of NA values is " - "deprecated. 
In a future version, NA values will be ordered " - "last instead of set to -1.", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = np.full(len(self), -1, dtype=np.intp) - notmask = ~mask - result[notmask] = np.argsort(values[notmask], kind=kind) - else: - result = np.argsort(values, kind=kind) + result = self.array.argsort(kind=kind) res = self._constructor( result, index=self.index, name=self.name, dtype=np.intp, copy=False diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b951d4c35d208..b7f0f973e640a 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -116,10 +116,8 @@ def test_argsort_missing_array(self, data_missing_for_sorting): tm.assert_numpy_array_equal(result, expected) def test_argsort_missing(self, data_missing_for_sorting): - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.Series(data_missing_for_sorting).argsort() - expected = pd.Series(np.array([1, -1, 0], dtype=np.intp)) + result = pd.Series(data_missing_for_sorting).argsort() + expected = pd.Series(np.array([2, 0, 1], dtype=np.intp)) tm.assert_series_equal(result, expected) def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 432c0eceee011..c1082c06ce307 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -20,21 +20,15 @@ def test_argsort_axis(self): def test_argsort_numpy(self, datetime_series): ser = datetime_series - res = np.argsort(ser).values expected = np.argsort(np.array(ser)) tm.assert_numpy_array_equal(res, expected) - # with missing values - ts = ser.copy() - ts[::2] = np.nan - - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False - ): - result = np.argsort(ts)[1::2] - expected = np.argsort(np.array(ts.dropna())) + def test_argsort_numpy_missing(self): + data = [0.1, np.nan, 0.2, np.nan, 0.3] + ser = Series(data) + result = np.argsort(ser) + expected = np.argsort(np.array(data)) tm.assert_numpy_array_equal(result.values, expected) @@ -56,10 +50,8 @@ def test_argsort_dt64(self, unit): expected = Series(range(5), dtype=np.intp) tm.assert_series_equal(result, expected) - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = shifted.argsort() - expected = Series(list(range(4)) + [-1], dtype=np.intp) + result = shifted.argsort() + expected = Series(list(range(4)) + [4], dtype=np.intp) tm.assert_series_equal(result, expected) def test_argsort_stable(self): From 679c590c51bb7d6dacca4853882d191994440370 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 26 Jun 2024 18:32:45 +0200 Subject: [PATCH 131/272] CI: Set up CI for the free-threaded build of 3.13 (#59058) * CI: Set up CI for the free-threaded build of 3.13 * Only run CI on ubuntu * No need for prerelease pip anymore * Update .github/workflows/unit-tests.yml Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 43 ++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/.github/workflows/unit-tests.yml 
b/.github/workflows/unit-tests.yml index 32b7aee7c2bdc..d2240192982fc 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -346,6 +346,49 @@ jobs: - name: Run Tests uses: ./.github/actions/run-tests + python-freethreading: + defaults: + run: + shell: bash -eou pipefail {0} + runs-on: ubuntu-22.04 + + timeout-minutes: 90 + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-freethreading-dev + cancel-in-progress: true + + env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + PYTEST_TARGET: pandas + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python Free-threading Version + uses: deadsnakes/action@v3.1.0 + with: + python-version: 3.13-dev + nogil: true + + - name: Build Environment + run: | + python --version + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython + python -m pip install versioneer[toml] + python -m pip install python-dateutil pytz tzdata hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov + python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" + python -m pip list + + - name: Run Tests + uses: ./.github/actions/run-tests + emscripten: # Note: the Python version, Emscripten toolchain version are determined # by the Pyodide version. The appropriate versions can be found in the From f28fe5abc0fb4bc869a198f5a4c32d638d1875bd Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 26 Jun 2024 22:25:57 +0200 Subject: [PATCH 132/272] CI: Keep GIL disabled in free-threading CI (#59109) --- .github/workflows/unit-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d2240192982fc..982877ee7f365 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -388,6 +388,8 @@ jobs: - name: Run Tests uses: ./.github/actions/run-tests + env: + PYTHON_GIL: 0 emscripten: # Note: the Python version, Emscripten toolchain version are determined From 32ceb4ae0b51a9b157ffc55902eb4edeb8e304f5 Mon Sep 17 00:00:00 2001 From: haffara <94278115+haffara@users.noreply.github.com> Date: Thu, 27 Jun 2024 00:44:42 +0200 Subject: [PATCH 133/272] DOC: fix Raises TypeError if any kind of string dtype is passed in (#59113) Co-authored-by: fatslow --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b156cd75e373..fab798dd617b7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4782,6 +4782,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ValueError * If both of ``include`` and ``exclude`` are empty * If ``include`` and ``exclude`` have overlapping elements + TypeError * If any kind of string dtype is passed in. 
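A short sketch of the ``TypeError`` case documented above (the error text in the comment is abridged):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
try:
    # Passing the generic string kind is rejected; use "object" instead.
    df.select_dtypes(include=["str"])
except TypeError as exc:
    print(exc)  # e.g. string dtypes are not allowed, use 'object' instead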
See Also From 195401f607b76b9d017bc31fa202d480127b9d8b Mon Sep 17 00:00:00 2001 From: Christopher Titchen <109701765+christopher-titchen@users.noreply.github.com> Date: Thu, 27 Jun 2024 16:13:43 +0100 Subject: [PATCH 134/272] BUG: `DataFrame.sparse.from_spmatrix` hard codes an invalid ``fill_value`` for certain subtypes (#59064) * BUG: :bug: :sparkles: Add fill_value param to from_spmatrix method. * ENH: :sparkles: Set explicit fill_value of NaN for complex floats. * TST: :white_check_mark: Fix failing tests. * TST: :white_check_mark: Add tests for from_spmatrix method. * DOC: :memo: Add what's new entry. * TST: :white_check_mark: Fix failing tests for sparse getitem. * TST: :white_check_mark: Remove test for 256-bit complex float. * DOC: :memo: Update example in docstring for from_spmatrix method. * DOC: :memo: Update some docstrings and sparse user guide. * DOC: :pencil2: Update dtype docstring. Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * BUG: :rewind: :bug: Revert fill_value change and fix to_coo method. * TST: :rewind: :white_check_mark: Fix and add sparse accessor tests. * TST: :rewind: :white_check_mark: Fix and add sparse getitem tests. * DOC: :rewind: :memo: Revert fill_value change to sparse user guide. * CLN: :pencil2: Fix instantiation of np.ma.array in test. --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/arrays/sparse/accessor.py | 12 ++-- pandas/core/dtypes/dtypes.py | 7 +- pandas/core/dtypes/missing.py | 4 +- pandas/tests/arrays/sparse/test_accessor.py | 77 +++++++++++---------- pandas/tests/dtypes/test_missing.py | 3 + pandas/tests/indexing/test_loc.py | 16 ++--- 7 files changed, 62 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 130ccded72859..2b487f19828fa 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -598,7 +598,7 @@ Reshaping Sparse ^^^^^^ - Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) -- +- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. 
(:issue:`59063`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 39a70fba9aa78..b8245349a4e62 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -291,12 +291,12 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3, dtype=float) + >>> mat = scipy.sparse.eye(3, dtype=int) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 - 0 1.0 0 0 - 1 0 1.0 0 - 2 0 0 1.0 + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 """ from pandas._libs.sparse import IntIndex @@ -313,7 +313,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: indices = data.indices indptr = data.indptr array_data = data.data - dtype = SparseDtype(array_data.dtype, 0) + dtype = SparseDtype(array_data.dtype) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) @@ -393,8 +393,6 @@ def to_coo(self) -> spmatrix: cols, rows, data = [], [], [] for col, (_, ser) in enumerate(self._parent.items()): sp_arr = ser.array - if sp_arr.fill_value != 0: - raise ValueError("fill value must be 0 when converting to COO matrix") row = sp_arr.sp_index.indices cols.append(np.repeat(col, len(row))) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 5213be8b69016..3aeab96e03163 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype): """ Dtype for data stored in :class:`SparseArray`. - `SparseDtype` is used as the data type for :class:`SparseArray`, enabling + ``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling more efficient storage of data that contains a significant number of repetitive values typically represented by a fill value. It supports any scalar dtype as the underlying data type of the non-fill values. @@ -1677,19 +1677,20 @@ class SparseDtype(ExtensionDtype): The dtype of the underlying array storing the non-fill value values. fill_value : scalar, optional The scalar value not stored in the SparseArray. By default, this - depends on `dtype`. + depends on ``dtype``. =========== ========== dtype na_value =========== ========== float ``np.nan`` + complex ``np.nan`` int ``0`` bool ``False`` datetime64 ``pd.NaT`` timedelta64 ``pd.NaT`` =========== ========== - The default value may be overridden by specifying a `fill_value`. + The default value may be overridden by specifying a ``fill_value``. 
Attributes ---------- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f0e21136f8a97..b9cd6ae2f13e8 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): nan >>> na_value_for_dtype(np.dtype("float64")) nan + >>> na_value_for_dtype(np.dtype("complex128")) + nan >>> na_value_for_dtype(np.dtype("bool")) False >>> na_value_for_dtype(np.dtype("datetime64[ns]")) @@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): elif dtype.kind in "mM": unit = np.datetime_data(dtype)[0] return dtype.type("NaT", unit) - elif dtype.kind == "f": + elif dtype.kind in "fc": return np.nan elif dtype.kind in "iu": if compat: diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 87eb7bcfa9cee..bd3298940ae3a 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -105,28 +105,36 @@ def test_accessor_raises(self): @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) - @pytest.mark.parametrize("dtype", ["float64", "int64"]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_from_spmatrix(self, format, labels, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) + sp_dtype = SparseDtype(dtype) - mat = sp_sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) + sp_mat = sp_sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels) + mat = np.eye(10, dtype=dtype) expected = pd.DataFrame( - np.eye(10, dtype=dtype), index=labels, columns=labels + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + index=labels, + columns=labels, ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) - def test_from_spmatrix_including_explicit_zero(self, format): + @pytest.mark.parametrize("dtype", [np.int64, bool]) + def test_from_spmatrix_including_explicit_zero(self, format, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - mat = sp_sparse.random(10, 2, density=0.5, format=format) - mat.data[0] = 0 - result = pd.DataFrame.sparse.from_spmatrix(mat) - dtype = SparseDtype("float64", 0.0) - expected = pd.DataFrame(mat.todense()).astype(dtype) + sp_dtype = SparseDtype(dtype) + + sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype) + sp_mat.data[0] = 0 + result = pd.DataFrame.sparse.from_spmatrix(sp_mat) + mat = sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value) + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format): def test_from_spmatrix_columns(self, columns): sp_sparse = pytest.importorskip("scipy.sparse") - dtype = SparseDtype("float64", 0.0) + sp_dtype = SparseDtype(np.float64) - mat = sp_sparse.random(10, 2, density=0.5) - result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) - expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) + sp_mat = sp_sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns) + mat = 
sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] + "columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] ) - def test_to_coo(self, colnames): + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) + def test_to_coo(self, columns, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - df = pd.DataFrame( - {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]" - ) - result = df.sparse.to_coo() - expected = sp_sparse.coo_matrix(np.asarray(df)) - assert (result != expected).nnz == 0 + sp_dtype = SparseDtype(dtype) - @pytest.mark.parametrize("fill_value", [1, np.nan]) - def test_to_coo_nonzero_fill_val_raises(self, fill_value): - pytest.importorskip("scipy") - df = pd.DataFrame( - { - "A": SparseArray( - [fill_value, fill_value, fill_value, 2], fill_value=fill_value - ), - "B": SparseArray( - [fill_value, 2, fill_value, fill_value], fill_value=fill_value - ), - } - ) - with pytest.raises(ValueError, match="fill value must be 0"): - df.sparse.to_coo() + expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype) + mat = expected.toarray() + result = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + dtype=sp_dtype, + ).sparse.to_coo() + assert (result != expected).nnz == 0 def test_to_coo_midx_categorical(self): # GH#50996 diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 2109c794ad44f..f86ed6f49759f 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -697,6 +697,9 @@ def test_array_equivalent_index_with_tuples(): ("f2", np.nan), ("f4", np.nan), ("f8", np.nan), + # Complex + ("c8", np.nan), + ("c16", np.nan), # Object ("O", np.nan), # Interval diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 16f3e0fd0c229..903ad24ce53b3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1281,7 +1281,7 @@ def test_loc_getitem_time_object(self, frame_or_series): tm.assert_equal(result, expected) @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) - @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): sp_sparse = pytest.importorskip("scipy.sparse") @@ -1296,13 +1296,13 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): # regression test for GH#34526 itr_idx = range(2, rows) - result = df.loc[itr_idx].values + result = np.nan_to_num(df.loc[itr_idx].values) expected = spmatrix.toarray()[itr_idx] tm.assert_numpy_array_equal(result, expected) # regression test for GH#34540 result = df.loc[itr_idx].dtypes.values - expected = np.full(cols, SparseDtype(dtype, fill_value=0)) + expected = np.full(cols, SparseDtype(dtype)) tm.assert_numpy_array_equal(result, expected) def test_loc_getitem_listlike_all_retains_sparse(self): @@ -1314,18 +1314,16 @@ def test_loc_getitem_sparse_frame(self): # GH34687 sp_sparse = pytest.importorskip("scipy.sparse") - df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5)) + df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64)) result = 
df.loc[range(2)] expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]], - dtype=SparseDtype("float64", 0.0), + [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]], + dtype=SparseDtype(np.int64), ) tm.assert_frame_equal(result, expected) result = df.loc[range(2)].loc[range(1)] - expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0) - ) + expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64)) tm.assert_frame_equal(result, expected) def test_loc_getitem_sparse_series(self): From 42082a80d0c3b3483f08b887271b1b08c66551e1 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Thu, 27 Jun 2024 18:15:28 +0300 Subject: [PATCH 135/272] CI: Modify the trailing-whitespace hook to preserve markdown hard linebreaks (#59117) --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c32f727213152..2d5cd9e841df3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,6 +67,7 @@ repos: - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort rev: 5.13.2 hooks: From 10d3615f12c85b6e7b5af20ba6a9d4feb9f6a332 Mon Sep 17 00:00:00 2001 From: "Y.X" Date: Fri, 28 Jun 2024 01:20:20 +0800 Subject: [PATCH 136/272] BUG: Add type check for encoding_errors in pd.read_csv (#59075) * BUG: Add type check for encoding_errors in pd.read_csv * BUG: Add type check for encoding_errors in pd.read_csv * pre-commit * Update pandas/io/parsers/readers.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Unit test Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/io/parsers/readers.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * update the unit test for `encoding_errors` * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * add a unit test * update unit test * update unit test * update unit test * update unit test * Update pandas/tests/io/test_common.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/tests/io/test_common.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * update unit test * update unit test --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/parsers/readers.py | 8 ++++++++ pandas/tests/io/test_common.py | 13 ++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2b487f19828fa..34cfca7a0e777 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -558,6 +558,7 @@ I/O - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) +- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. 
(:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d00fc3b15976c..c28d3aaaf4748 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -674,6 +674,14 @@ def _read( # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) + + # Check type of encoding_errors + errors = kwds.get("encoding_errors", "strict") + if not isinstance(errors, str): + raise ValueError( + f"encoding_errors must be a string, got {type(errors).__name__}" + ) + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d73790365bb1f..26bb2be73838a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -555,7 +555,7 @@ def test_explicit_encoding(io_class, mode, msg): expected.to_csv(buffer, mode=f"w{mode}") -@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"]) +@pytest.mark.parametrize("encoding_errors", ["strict", "replace"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_encoding_errors(encoding_errors, format): # GH39450 @@ -590,6 +590,17 @@ def test_encoding_errors(encoding_errors, format): tm.assert_frame_equal(df, expected) +@pytest.mark.parametrize("encoding_errors", [0, None]) +def test_encoding_errors_badtype(encoding_errors): + # GH 59075 + content = StringIO("A,B\n1,2\n3,4\n") + reader = partial(pd.read_csv, encoding_errors=encoding_errors) + expected_error = "encoding_errors must be a string, got " + expected_error += f"{type(encoding_errors).__name__}" + with pytest.raises(ValueError, match=expected_error): + reader(content) + + def test_bad_encdoing_errors(): # GH 39777 with tm.ensure_clean() as path: From 0320b3c25b641513e84969aa2d92e4f1b501487f Mon Sep 17 00:00:00 2001 From: eilonc-cx <160746118+eilonc-cx@users.noreply.github.com> Date: Thu, 27 Jun 2024 20:22:17 +0300 Subject: [PATCH 137/272] DOC: Update warning message in pandas.eval function (#59108) * Update warning message in pandas.eval function Modify warning to indicate the risks using eval func. * Update pandas.eval function warning message - fix Docstring * Update pandas/core/computation/eval.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/core/computation/eval.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/computation/eval.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index fee08c6199eef..aad768d31483a 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -193,8 +193,11 @@ def eval( corresponding bitwise operators. :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects are supported and behave as they would with plain ol' Python evaluation. - `eval` can run arbitrary code which can make you vulnerable to code - injection if you pass user input to this function. + + .. warning:: + + ``eval`` can run arbitrary code which can make you vulnerable to code + injection and untrusted data. 
Parameters ---------- From 750d75e0385d845551e1b611d79d8644f5ea92b9 Mon Sep 17 00:00:00 2001 From: JBurley Date: Thu, 27 Jun 2024 16:51:33 -0400 Subject: [PATCH 138/272] DOC: json_normalize breaking changes in pandas 3.0.0 (#59127) * note breaking change in json_normalize retaining index For context: #51542 & #57422 * Update doc/source/whatsnew/v3.0.0.rst * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 34cfca7a0e777..d9d2330f8f11b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -230,6 +230,7 @@ Other API changes - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) - :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) +- Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`) From a89f20853591516b4ba45a1fbadbf645247d133e Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Thu, 27 Jun 2024 22:52:45 +0200 Subject: [PATCH 139/272] CLN: unify a ValueError message for removed units T, L, U, N and remove these entries from UnitChoices (#59119) cln: change msg in ValueError for units T, L, U, N --- pandas/_libs/tslibs/dtypes.pyx | 4 ---- pandas/_libs/tslibs/timedeltas.pyi | 6 ------ pandas/_libs/tslibs/timedeltas.pyx | 5 ----- pandas/tests/tslibs/test_resolution.py | 7 ------- 4 files changed, 22 deletions(-) diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index e047566a1868e..0fdadf5b7611d 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -453,10 +453,6 @@ class Resolution(Enum): """ cdef: str abbrev - if freq in {"T", "t", "L", "l", "U", "u", "N", "n"}: - raise ValueError( - f"Frequency \'{freq}\' is no longer supported." 
-        )
     try:
         if freq in c_DEPR_ABBREVS:
             abbrev = c_DEPR_ABBREVS[freq]
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index 24ec6c8891a89..979a5666661b2 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -39,8 +39,6 @@ UnitChoices: TypeAlias = Literal[
     "minute",
     "min",
     "minutes",
-    "T",
-    "t",
     "s",
     "seconds",
     "sec",
@@ -50,21 +48,17 @@
     "millisecond",
     "milli",
     "millis",
-    "L",
-    "l",
     "us",
     "microseconds",
     "microsecond",
     "µs",
     "micro",
     "micros",
-    "u",
     "ns",
     "nanoseconds",
     "nano",
     "nanos",
     "nanosecond",
-    "n",
 ]

 _S = TypeVar("_S", bound=timedelta)
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index de192d511d507..d5348311f19e2 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1818,11 +1818,6 @@ class Timedelta(_Timedelta):
             * 'microseconds', 'microsecond', 'micros', 'micro', or 'us'
             * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'.

-            .. deprecated:: 2.2.0
-
-                Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour
-                of the values `h`, `min`, `s`, `ms`, `us`, and `ns`.
-
             .. deprecated:: 3.0.0

                 Allowing the values `w`, `d`, `MIN`, `MS`, `US` and `NS` to denote units
diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py
index 49b87c055dc69..722359380f6a3 100644
--- a/pandas/tests/tslibs/test_resolution.py
+++ b/pandas/tests/tslibs/test_resolution.py
@@ -56,10 +56,3 @@ def test_units_H_S_deprecated_from_attrname_to_abbrevs(freq):

     with tm.assert_produces_warning(FutureWarning, match=msg):
         Resolution.get_reso_from_freqstr(freq)
-
-
-@pytest.mark.parametrize("freq", ["T", "t", "L", "U", "N", "n"])
-def test_reso_abbrev_T_L_U_N_raises(freq):
-    msg = f"Frequency '{freq}' is no longer supported."
-    with pytest.raises(ValueError, match=msg):
-        Resolution.get_reso_from_freqstr(freq)

From 23e592f51b7ad94ce52d2376fff59a8d64bade99 Mon Sep 17 00:00:00 2001
From: Pascal Corpet
Date: Sat, 29 Jun 2024 20:15:39 +0200
Subject: [PATCH 140/272] BUG: `pivot_table` chokes on pd.DatetimeTZDtype if
 there are no rows. (#59123)

BUG: `pivot_table` chokes on pd.DatetimeTZDtype if there are no rows.
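A minimal sketch of the failing case, mirroring the regression test added
in this patch (variable names are illustrative; before the fix the empty
pivot raised instead of returning an empty frame):

    import pandas as pd

    df = pd.DataFrame(
        {
            "timestamp": pd.Series([], dtype=pd.DatetimeTZDtype(tz="UTC")),
            "category": pd.Series([], dtype=str),
            "value": pd.Series([], dtype=str),
        }
    )
    # With this patch applied, pivoting the empty tz-aware column
    # returns an empty result instead of raising.
    result = df.pivot_table(index="category", columns="value", values="timestamp")
    assert result.empty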
This is a follow up to #41875
---
 doc/source/whatsnew/v3.0.0.rst     |  1 +
 pandas/core/reshape/reshape.py     | 24 +++++++++++-------------
 pandas/tests/reshape/test_pivot.py | 14 ++++++++++++++
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index d9d2330f8f11b..afb2f91f65ccd 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -596,6 +596,7 @@ Reshaping
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
+- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)

 Sparse
 ^^^^^^
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 3df55256ec43b..9b7b768fe7adb 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -288,21 +288,19 @@ def get_new_values(self, values, fill_value=None):

         dtype = values.dtype

-        # if our mask is all True, then we can use our existing dtype
-        if mask_all:
-            dtype = values.dtype
-            new_values = np.empty(result_shape, dtype=dtype)
-        else:
-            if isinstance(dtype, ExtensionDtype):
-                # GH#41875
-                # We are assuming that fill_value can be held by this dtype,
-                # unlike the non-EA case that promotes.
-                cls = dtype.construct_array_type()
-                new_values = cls._empty(result_shape, dtype=dtype)
+        if isinstance(dtype, ExtensionDtype):
+            # GH#41875
+            # We are assuming that fill_value can be held by this dtype,
+            # unlike the non-EA case that promotes.
+            cls = dtype.construct_array_type()
+            new_values = cls._empty(result_shape, dtype=dtype)
+            if not mask_all:
                 new_values[:] = fill_value
-            else:
+        else:
+            if not mask_all:
                 dtype, fill_value = maybe_promote(dtype, fill_value)
-                new_values = np.empty(result_shape, dtype=dtype)
+            new_values = np.empty(result_shape, dtype=dtype)
+            if not mask_all:
                 new_values.fill(fill_value)

         name = dtype.name
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 728becc76b71f..2872b1e29d629 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -2769,3 +2769,17 @@ def test_unstack_copy(self, m):
         result = df.unstack(sort=False)
         result.iloc[0, 0] = -1
         tm.assert_frame_equal(df, df_orig)
+
+    def test_pivot_empty_with_datetime(self):
+        # GH#59126
+        df = DataFrame(
+            {
+                "timestamp": Series([], dtype=pd.DatetimeTZDtype(tz="UTC")),
+                "category": Series([], dtype=str),
+                "value": Series([], dtype=str),
+            }
+        )
+        df_pivoted = df.pivot_table(
+            index="category", columns="value", values="timestamp"
+        )
+        assert df_pivoted.empty

From f2eeb4e9baee7ba4d8ff854b22c2c34fd865d969 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Mon, 1 Jul 2024 16:30:32 +0200
Subject: [PATCH 141/272] Add Py_mod_gil slot to C extension modules (#59135)

---
 pandas/_libs/src/datetime/pd_datetime.c | 7 ++++++-
 pandas/_libs/src/parser/pd_parser.c     | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c
index 4c1969f6d9f57..2c32fb0481486 100644
--- a/pandas/_libs/src/datetime/pd_datetime.c
+++ b/pandas/_libs/src/datetime/pd_datetime.c
@@ -245,7 +245,12 @@ static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) {
 }

 static PyModuleDef_Slot pandas_datetime_slots[] = {
-    {Py_mod_exec, 
pandas_datetime_exec}, {0, NULL}}; + {Py_mod_exec, pandas_datetime_exec}, +#if PY_VERSION_HEX >= 0x030D0000 + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, +#endif + {0, NULL}, +}; static struct PyModuleDef pandas_datetimemodule = { PyModuleDef_HEAD_INIT, diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c index 48f3cd14cbc30..51cdf071a15cf 100644 --- a/pandas/_libs/src/parser/pd_parser.c +++ b/pandas/_libs/src/parser/pd_parser.c @@ -161,7 +161,12 @@ static int pandas_parser_exec(PyObject *Py_UNUSED(module)) { } static PyModuleDef_Slot pandas_parser_slots[] = { - {Py_mod_exec, pandas_parser_exec}, {0, NULL}}; + {Py_mod_exec, pandas_parser_exec}, +#if PY_VERSION_HEX >= 0x030D0000 + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, +#endif + {0, NULL}, +}; static struct PyModuleDef pandas_parsermodule = { PyModuleDef_HEAD_INIT, From db13fb5e68771183f77be3d8b3742824edd4d265 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 1 Jul 2024 06:36:56 -1000 Subject: [PATCH 142/272] BUG: Allow show_versions to work for any module that raises an exception (#59114) * BUG: Allow show_versions to work for any module that raises an exception * Remove setuptools --- pandas/util/_print_versions.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index c4fec39594407..7e18ebe40cfa8 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -45,7 +45,7 @@ def _get_sys_info() -> dict[str, JSONSerializable]: language_code, encoding = locale.getlocale() return { "commit": _get_commit_hash(), - "python": ".".join([str(i) for i in sys.version_info]), + "python": platform.python_version(), "python-bits": struct.calcsize("P") * 8, "OS": uname_result.system, "OS-release": uname_result.release, @@ -70,33 +70,25 @@ def _get_dependency_info() -> dict[str, JSONSerializable]: "pytz", "dateutil", # install / build, - "setuptools", "pip", "Cython", - # test - "pytest", - "hypothesis", # docs "sphinx", - # Other, need a min version - "blosc", - "feather", - "xlsxwriter", - "lxml.etree", - "html5lib", - "pymysql", - "psycopg2", - "jinja2", # Other, not imported. 
"IPython", - "pandas_datareader", ] + # Optional dependencies deps.extend(list(VERSIONS)) result: dict[str, JSONSerializable] = {} for modname in deps: - mod = import_optional_dependency(modname, errors="ignore") - result[modname] = get_version(mod) if mod else None + try: + mod = import_optional_dependency(modname, errors="ignore") + except Exception: + # Dependency conflicts may cause a non ImportError + result[modname] = "N/A" + else: + result[modname] = get_version(mod) if mod else None return result From 8bca186f9b7c6ffad142887f4dfe0629a4166535 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 1 Jul 2024 23:08:08 +0530 Subject: [PATCH 143/272] DOC: add SA01 for pandas.Timestamp.now (#59159) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/nattype.pyx | 10 ++++++++++ pandas/_libs/tslibs/timestamps.pyx | 10 ++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 0fa61a2465a2e..5d0c523f2651f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -244,7 +244,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.month_name SA01" \ -i "pandas.Timestamp.nanosecond GL08" \ -i "pandas.Timestamp.normalize SA01" \ - -i "pandas.Timestamp.now SA01" \ -i "pandas.Timestamp.quarter SA01" \ -i "pandas.Timestamp.replace PR07,SA01" \ -i "pandas.Timestamp.resolution PR02" \ diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 27a371ef43832..8a47f1aee041d 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -957,11 +957,21 @@ class NaTType(_NaT): """ Return new Timestamp object representing current time local to tz. + This method returns a new `Timestamp` object that represents the current time. + If a timezone is provided, the current time will be localized to that timezone. + Otherwise, it returns the current local time. + Parameters ---------- tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.today : Return the current time in the local timezone. + Examples -------- >>> pd.Timestamp.now() # doctest: +SKIP diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 628527bd4ff9b..d4f37cf640c50 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1465,11 +1465,21 @@ class Timestamp(_Timestamp): """ Return new Timestamp object representing current time local to tz. + This method returns a new `Timestamp` object that represents the current time. + If a timezone is provided, the current time will be localized to that timezone. + Otherwise, it returns the current local time. + Parameters ---------- tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.today : Return the current time in the local timezone. 
+ Examples -------- >>> pd.Timestamp.now() # doctest: +SKIP From 3782dd17c37df76c75de14b12b5afa88f812199c Mon Sep 17 00:00:00 2001 From: mutricyl <118692416+mutricyl@users.noreply.github.com> Date: Mon, 1 Jul 2024 21:15:24 +0200 Subject: [PATCH 144/272] remove ops div class to solve #21374 (#59144) * remove core.computation.ops.Div resolves #21374 #58748 * need to preserve order * updating tests * update whatsnew * solve mypy issue * fixing pytests * better than cast * adding specific test * Update pandas/tests/frame/test_query_eval.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/tests/computation/test_eval.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Laurent Mutricy Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_testing/__init__.py | 1 + pandas/conftest.py | 15 ++++++++ pandas/core/computation/expr.py | 6 +--- pandas/core/computation/ops.py | 52 --------------------------- pandas/tests/computation/test_eval.py | 23 ++++++++---- pandas/tests/frame/test_query_eval.py | 16 +++++++-- 7 files changed, 48 insertions(+), 66 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index afb2f91f65ccd..a94d0588a081d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -616,6 +616,7 @@ Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) +- Bug in :func:`eval` on :class:`complex` including division ``/`` discards imaginary part. (:issue:`21374`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index fb8ca8aad3428..1cd91ee5b120c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -107,6 +107,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] STRING_DTYPES: list[Dtype] = [str, "str", "U"] +COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] TIMEDELTA64_DTYPES: list[Dtype] = ["timedelta64[ns]", "m8[ns]"] diff --git a/pandas/conftest.py b/pandas/conftest.py index c3bfc8c06ad8a..70e729dfb98a4 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1448,6 +1448,21 @@ def complex_dtype(request): return request.param +@pytest.fixture(params=tm.COMPLEX_FLOAT_DTYPES) +def complex_or_float_dtype(request): + """ + Parameterized fixture for complex and numpy float dtypes. 
+ + * complex + * 'complex64' + * 'complex128' + * float + * 'float32' + * 'float64' + """ + return request.param + + @pytest.fixture(params=tm.SIGNED_INT_NUMPY_DTYPES) def any_signed_int_numpy_dtype(request): """ diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index b287cd542068d..b074e768e0842 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -32,7 +32,6 @@ UNARY_OPS_SYMS, BinOp, Constant, - Div, FuncNode, Op, Term, @@ -374,7 +373,7 @@ class BaseExprVisitor(ast.NodeVisitor): "Add", "Sub", "Mult", - None, + "Div", "Pow", "FloorDiv", "Mod", @@ -537,9 +536,6 @@ def visit_BinOp(self, node, **kwargs): left, right = self._maybe_downcast_constants(left, right) return self._maybe_evaluate_binop(op, op_class, left, right) - def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs) - def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) operand = self.visit(node.operand) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 056325fd2e4ab..a1a5f77f8539e 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -18,7 +18,6 @@ from pandas.core.dtypes.common import ( is_list_like, - is_numeric_dtype, is_scalar, ) @@ -328,31 +327,6 @@ def _not_in(x, y): _binary_ops_dict.update(d) -def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: - """ - Cast an expression inplace. - - Parameters - ---------- - terms : Op - The expression that should cast. - acceptable_dtypes : list of acceptable numpy.dtype - Will not cast if term's dtype in this list. - dtype : str or numpy.dtype - The dtype to cast to. - """ - dt = np.dtype(dtype) - for term in terms: - if term.type in acceptable_dtypes: - continue - - try: - new_value = term.value.astype(dt) - except AttributeError: - new_value = dt.type(term.value) - term.update(new_value) - - def is_term(obj) -> bool: return isinstance(obj, Term) @@ -509,32 +483,6 @@ def _disallow_scalar_only_bool_ops(self) -> None: raise NotImplementedError("cannot evaluate scalar only bool ops") -class Div(BinOp): - """ - Div operator to special case casting. - - Parameters - ---------- - lhs, rhs : Term or Op - The Terms or Ops in the ``/`` expression. - """ - - def __init__(self, lhs, rhs) -> None: - super().__init__("/", lhs, rhs) - - if not is_numeric_dtype(lhs.return_type) or not is_numeric_dtype( - rhs.return_type - ): - raise TypeError( - f"unsupported operand type(s) for {self.op}: " - f"'{lhs.return_type}' and '{rhs.return_type}'" - ) - - # do not upcast float32s to float64 un-necessarily - acceptable_dtypes = [np.float32, np.float64] - _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) - - UNARY_OPS_SYMS = ("+", "-", "~", "not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) _unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index d52f33fe80434..1844b47847e95 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -758,16 +758,25 @@ class TestTypeCasting: # maybe someday... 
numexpr has too many upcasting rules now # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float'])) @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")]) - def test_binop_typecasting(self, engine, parser, op, float_numpy_dtype, left_right): - df = DataFrame( - np.random.default_rng(2).standard_normal((5, 3)), dtype=float_numpy_dtype - ) + def test_binop_typecasting( + self, engine, parser, op, complex_or_float_dtype, left_right, request + ): + # GH#21374 + dtype = complex_or_float_dtype + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dtype) left, right = left_right s = f"{left} {op} {right}" res = pd.eval(s, engine=engine, parser=parser) - assert df.values.dtype == float_numpy_dtype - assert res.values.dtype == float_numpy_dtype - tm.assert_frame_equal(res, eval(s)) + if dtype == "complex64" and engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr issue with complex that are upcast " + "to complex 128 " + "https://github.com/pydata/numexpr/issues/492" + ) + request.applymarker(mark) + assert df.values.dtype == dtype + assert res.values.dtype == dtype + tm.assert_frame_equal(res, eval(s), check_exact=False) # ------------------------------------- diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index ff1bf5632e920..c9ea5f379f1e9 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -202,11 +202,23 @@ def test_eval_simple(self, engine, parser): expected = df["a"] tm.assert_series_equal(expected, res) - def test_extension_array_eval(self, engine, parser): + def test_extension_array_eval(self, engine, parser, request): # GH#58748 + if engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr does not support extension array dtypes" + ) + request.applymarker(mark) df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])}) result = df.eval("a / b", engine=engine, parser=parser) - expected = Series([0.25, 0.40, 0.50]) + expected = Series(pd.array([0.25, 0.40, 0.50])) + tm.assert_series_equal(result, expected) + + def test_complex_eval(self, engine, parser): + # GH#21374 + df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]}) + result = df.eval("a/b", engine=engine, parser=parser) + expected = Series([1.5 + 0.5j]) tm.assert_series_equal(result, expected) From bc79c520c657bf37a6fb7d08ecdd625fa6ab1c3a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 16:04:32 -0700 Subject: [PATCH 145/272] [pre-commit.ci] pre-commit autoupdate (#59156) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.4.7 → v0.5.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.4.7...v0.5.0) - [github.com/asottile/pyupgrade: v3.15.2 → v3.16.0](https://github.com/asottile/pyupgrade/compare/v3.15.2...v3.16.0) - [github.com/pre-commit/mirrors-clang-format: v18.1.5 → v18.1.8](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.5...v18.1.8) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Ingore E721 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 6 +++--- pandas/core/common.py | 2 +- pandas/core/indexes/base.py | 2 +- 
pandas/core/internals/construction.py | 2 +- pandas/core/tools/datetimes.py | 2 +- pandas/tests/frame/test_repr.py | 4 ++-- pyproject.toml | 2 ++ 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d5cd9e841df3..b81b9ba070a44 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.7 + rev: v0.5.0 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -73,7 +73,7 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.15.2 + rev: v3.16.0 hooks: - id: pyupgrade args: [--py310-plus] @@ -93,7 +93,7 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.5 + rev: v18.1.8 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/pandas/core/common.py b/pandas/core/common.py index 1423ea456384b..ec0473a20458b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -145,7 +145,7 @@ def is_bool_indexer(key: Any) -> bool: elif isinstance(key, list): # check if np.array(key).dtype would be bool if len(key) > 0: - if type(key) is not list: # noqa: E721 + if type(key) is not list: # GH#42461 cython will raise TypeError if we pass a subclass key = list(key) return lib.is_bool_list(key) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 71dfff520113c..7d43498d4267b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7528,7 +7528,7 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: index_like = list(index_like) if isinstance(index_like, list): - if type(index_like) is not list: # noqa: E721 + if type(index_like) is not list: # must check for exactly list here because of strict type # check in clean_index_list index_like = list(index_like) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 23572975a1112..0d149f47fd08c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -842,7 +842,7 @@ def _list_of_dict_to_arrays( # assure that they are of the base dict class and not of derived # classes - data = [d if type(d) is dict else dict(d) for d in data] # noqa: E721 + data = [d if type(d) is dict else dict(d) for d in data] content = lib.dicts_to_array(data, list(columns)) return content, columns diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9b8970f86ed6d..0e91bfa99e887 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -129,7 +129,7 @@ class FulldatetimeDict(YearMonthDayDict, total=False): def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721 + if type(first_non_nan_element := arr[first_non_null]) is str: # GH#32264 np.str_ object guessed_format = guess_datetime_format( first_non_nan_element, dayfirst=dayfirst diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index f6e0251d52de1..f799495d8025a 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -38,10 +38,10 @@ def test_repr_should_return_str(self): index1 = ["\u03c3", 
"\u03c4", "\u03c5", "\u03c6"] cols = ["\u03c8"] df = DataFrame(data, columns=cols, index=index1) - assert type(df.__repr__()) is str # noqa: E721 + assert type(df.__repr__()) is str ser = df[cols[0]] - assert type(ser.__repr__()) is str # noqa: E721 + assert type(ser.__repr__()) is str def test_repr_bytes_61_lines(self): # GH#12857 diff --git a/pyproject.toml b/pyproject.toml index 661e8efbb95fc..9156c73efbb35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -318,6 +318,8 @@ ignore = [ "RUF007", # mutable-class-default "RUF012", + # type-comparison + "E721", # Additional pylint rules # literal-membership From 69fe98dda091cae71ea699be3f44926406851a2c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 1 Jul 2024 16:33:07 -1000 Subject: [PATCH 146/272] REF: Make read_json less stateful (#59124) * REF: Make read_json less stateful * Fix typing * Clean up dataframe column casting * Remove extra bool return * Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/json/_json.py | 209 ++++++++++--------------- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/io/json/test_readlines.py | 4 +- 4 files changed, 86 insertions(+), 131 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a94d0588a081d..be4b9c218f9f5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -563,8 +563,8 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) +- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) -- Period ^^^^^^ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 74e6595a7f0f2..24fcb78a41e9d 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -969,7 +969,7 @@ def read(self) -> DataFrame | Series: else: return obj - def _get_object_parser(self, json) -> DataFrame | Series: + def _get_object_parser(self, json: str) -> DataFrame | Series: """ Parses a json document into a pandas object. 
""" @@ -985,16 +985,14 @@ def _get_object_parser(self, json) -> DataFrame | Series: "date_unit": self.date_unit, "dtype_backend": self.dtype_backend, } - obj = None if typ == "frame": - obj = FrameParser(json, **kwargs).parse() - - if typ == "series" or obj is None: + return FrameParser(json, **kwargs).parse() + elif typ == "series": if not isinstance(dtype, bool): kwargs["dtype"] = dtype - obj = SeriesParser(json, **kwargs).parse() - - return obj + return SeriesParser(json, **kwargs).parse() + else: + raise ValueError(f"{typ=} must be 'frame' or 'series'.") def close(self) -> None: """ @@ -1107,7 +1105,6 @@ def __init__( self.convert_dates = convert_dates self.date_unit = date_unit self.keep_default_dates = keep_default_dates - self.obj: DataFrame | Series | None = None self.dtype_backend = dtype_backend @final @@ -1121,26 +1118,22 @@ def check_keys_split(self, decoded: dict) -> None: raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}") @final - def parse(self): - self._parse() + def parse(self) -> DataFrame | Series: + obj = self._parse() - if self.obj is None: - return None if self.convert_axes: - self._convert_axes() - self._try_convert_types() - return self.obj + obj = self._convert_axes(obj) + obj = self._try_convert_types(obj) + return obj - def _parse(self) -> None: + def _parse(self) -> DataFrame | Series: raise AbstractMethodError(self) @final - def _convert_axes(self) -> None: + def _convert_axes(self, obj: DataFrame | Series) -> DataFrame | Series: """ Try to convert axes. """ - obj = self.obj - assert obj is not None # for mypy for axis_name in obj._AXIS_ORDERS: ax = obj._get_axis(axis_name) ser = Series(ax, dtype=ax.dtype, copy=False) @@ -1153,9 +1146,10 @@ def _convert_axes(self) -> None: ) if result: new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) - setattr(self.obj, axis_name, new_axis) + setattr(obj, axis_name, new_axis) + return obj - def _try_convert_types(self) -> None: + def _try_convert_types(self, obj): raise AbstractMethodError(self) @final @@ -1182,8 +1176,10 @@ def _try_convert_data( elif self.dtype is True: pass - else: - # dtype to force + elif not _should_convert_dates( + convert_dates, self.keep_default_dates, name + ): + # convert_dates takes precedence over columns listed in dtypes dtype = ( self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype ) @@ -1194,8 +1190,8 @@ def _try_convert_data( return data, False if convert_dates: - new_data, result = self._try_convert_to_date(data) - if result: + new_data = self._try_convert_to_date(data) + if new_data is not data: return new_data, True converted = False @@ -1245,16 +1241,16 @@ def _try_convert_data( return data, converted @final - def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: + def _try_convert_to_date(self, data: Series) -> Series: """ Try to parse a ndarray like into a date column. Try to coerce object in epoch/iso formats and integer/float in epoch - formats. Return a boolean if parsing was successful. + formats. 
""" # no conversion on empty if not len(data): - return data, False + return data new_data = data @@ -1265,7 +1261,7 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: try: new_data = data.astype("int64") except OverflowError: - return data, False + return data except (TypeError, ValueError): pass @@ -1277,57 +1273,45 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: | (new_data._values == iNaT) ) if not in_range.all(): - return data, False + return data date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS for date_unit in date_units: try: - new_data = to_datetime(new_data, errors="raise", unit=date_unit) + return to_datetime(new_data, errors="raise", unit=date_unit) except (ValueError, OverflowError, TypeError): continue - return new_data, True - return data, False + return data class SeriesParser(Parser): _default_orient = "index" _split_keys = ("name", "index", "data") - obj: Series | None - def _parse(self) -> None: + def _parse(self) -> Series: data = ujson_loads(self.json, precise_float=self.precise_float) if self.orient == "split": decoded = {str(k): v for k, v in data.items()} self.check_keys_split(decoded) - self.obj = Series(**decoded) + return Series(**decoded) else: - self.obj = Series(data) + return Series(data) - def _try_convert_types(self) -> None: - if self.obj is None: - return - obj, result = self._try_convert_data( - "data", self.obj, convert_dates=self.convert_dates - ) - if result: - self.obj = obj + def _try_convert_types(self, obj: Series) -> Series: + obj, _ = self._try_convert_data("data", obj, convert_dates=self.convert_dates) + return obj class FrameParser(Parser): _default_orient = "columns" _split_keys = ("columns", "index", "data") - obj: DataFrame | None - def _parse(self) -> None: + def _parse(self) -> DataFrame: json = self.json orient = self.orient - if orient == "columns": - self.obj = DataFrame( - ujson_loads(json, precise_float=self.precise_float), dtype=None - ) - elif orient == "split": + if orient == "split": decoded = { str(k): v for k, v in ujson_loads(json, precise_float=self.precise_float).items() @@ -1341,90 +1325,61 @@ def _parse(self) -> None: orig_names, is_potential_multi_index(orig_names, None), ) - self.obj = DataFrame(dtype=None, **decoded) + return DataFrame(dtype=None, **decoded) elif orient == "index": - self.obj = DataFrame.from_dict( + return DataFrame.from_dict( ujson_loads(json, precise_float=self.precise_float), dtype=None, orient="index", ) elif orient == "table": - self.obj = parse_table_schema(json, precise_float=self.precise_float) + return parse_table_schema(json, precise_float=self.precise_float) else: - self.obj = DataFrame( + # includes orient == "columns" + return DataFrame( ujson_loads(json, precise_float=self.precise_float), dtype=None ) - def _process_converter( - self, - f: Callable[[Hashable, Series], tuple[Series, bool]], - filt: Callable[[Hashable], bool] | None = None, - ) -> None: - """ - Take a conversion function and possibly recreate the frame. 
- """ - if filt is None: - filt = lambda col: True - - obj = self.obj - assert obj is not None # for mypy - - needs_new_obj = False - new_obj = {} - for i, (col, c) in enumerate(obj.items()): - if filt(col): - new_data, result = f(col, c) - if result: - c = new_data - needs_new_obj = True - new_obj[i] = c - - if needs_new_obj: - # possibly handle dup columns - new_frame = DataFrame(new_obj, index=obj.index) - new_frame.columns = obj.columns - self.obj = new_frame - - def _try_convert_types(self) -> None: - if self.obj is None: - return - if self.convert_dates: - self._try_convert_dates() - - self._process_converter( - lambda col, c: self._try_convert_data(col, c, convert_dates=False) + def _try_convert_types(self, obj: DataFrame) -> DataFrame: + arrays = [] + for col_label, series in obj.items(): + result, _ = self._try_convert_data( + col_label, + series, + convert_dates=_should_convert_dates( + self.convert_dates, + keep_default_dates=self.keep_default_dates, + col=col_label, + ), + ) + arrays.append(result.array) + return DataFrame._from_arrays( + arrays, obj.columns, obj.index, verify_integrity=False ) - def _try_convert_dates(self) -> None: - if self.obj is None: - return - - # our columns to parse - convert_dates_list_bool = self.convert_dates - if isinstance(convert_dates_list_bool, bool): - convert_dates_list_bool = [] - convert_dates = set(convert_dates_list_bool) - - def is_ok(col) -> bool: - """ - Return if this col is ok to try for a date parse. - """ - if col in convert_dates: - return True - if not self.keep_default_dates: - return False - if not isinstance(col, str): - return False - - col_lower = col.lower() - if ( - col_lower.endswith(("_at", "_time")) - or col_lower == "modified" - or col_lower == "date" - or col_lower == "datetime" - or col_lower.startswith("timestamp") - ): - return True - return False - self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok) +def _should_convert_dates( + convert_dates: bool | list[str], + keep_default_dates: bool, + col: Hashable, +) -> bool: + """ + Return bool whether a DataFrame column should be cast to datetime. 
+ """ + if convert_dates is False: + # convert_dates=True means follow keep_default_dates + return False + elif not isinstance(convert_dates, bool) and col in set(convert_dates): + return True + elif not keep_default_dates: + return False + elif not isinstance(col, str): + return False + col_lower = col.lower() + if ( + col_lower.endswith(("_at", "_time")) + or col_lower in {"modified", "date", "datetime"} + or col_lower.startswith("timestamp") + ): + return True + return False diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b53957a7e77d1..e00c193fd471a 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -792,7 +792,7 @@ def test_frame_from_json_precise_float(self): def test_typ(self): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64") - result = read_json(StringIO(s.to_json()), typ=None) + result = read_json(StringIO(s.to_json()), typ="series") tm.assert_series_equal(result, s) def test_reconstruction_index(self): diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index d96ccb4b94cc2..3c843479b446a 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -165,11 +165,11 @@ def test_readjson_chunks_series(request, engine): s = pd.Series({"A": 1, "B": 2}) strio = StringIO(s.to_json(lines=True, orient="records")) - unchunked = read_json(strio, lines=True, typ="Series", engine=engine) + unchunked = read_json(strio, lines=True, typ="series", engine=engine) strio = StringIO(s.to_json(lines=True, orient="records")) with read_json( - strio, lines=True, typ="Series", chunksize=1, engine=engine + strio, lines=True, typ="series", chunksize=1, engine=engine ) as reader: chunked = pd.concat(reader) From faad5abf44d01114bfaab3910ca66e15c8d19e41 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 3 Jul 2024 01:27:09 +0530 Subject: [PATCH 147/272] DOC: add SA01 for pandas.Timestamp.time (#59169) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/nattype.pyx | 10 ++++++++++ pandas/_libs/tslibs/timestamps.pyx | 10 ++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5d0c523f2651f..d0123e64eb542 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -249,7 +249,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.second GL08" \ -i "pandas.Timestamp.strptime PR01,SA01" \ - -i "pandas.Timestamp.time SA01" \ -i "pandas.Timestamp.timestamp SA01" \ -i "pandas.Timestamp.timetuple SA01" \ -i "pandas.Timestamp.timetz SA01" \ diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 8a47f1aee041d..c1f2341328570 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -633,6 +633,16 @@ class NaTType(_NaT): """ Return time object with same time but with tzinfo=None. + This method extracts the time part of the `Timestamp` object, excluding any + timezone information. It returns a `datetime.time` object which only represents + the time (hours, minutes, seconds, and microseconds). + + See Also + -------- + Timestamp.date : Return date object with same year, month and day. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. 
+ Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index d4f37cf640c50..c4bd9e1b47bbe 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1778,6 +1778,16 @@ class Timestamp(_Timestamp): """ Return time object with same time but with tzinfo=None. + This method extracts the time part of the `Timestamp` object, excluding any + timezone information. It returns a `datetime.time` object which only represents + the time (hours, minutes, seconds, and microseconds). + + See Also + -------- + Timestamp.date : Return date object with same year, month and day. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') From 684faf7906d48fe45b9ae7ab6fecc5f445c7a466 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 2 Jul 2024 18:24:06 -0400 Subject: [PATCH 148/272] DOC: Series reduction return value descriptions (#59171) --- pandas/core/series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index a22cc59b62499..184c774d04a47 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -6567,7 +6567,7 @@ def min( Returns ------- scalar or Series (if level specified) - The maximum of the values in the Series. + The minimum of the values in the Series. See Also -------- @@ -6716,7 +6716,7 @@ def sum( Returns ------- scalar or Series (if level specified) - Median of the values for the requested axis. + Sum of the values for the requested axis. See Also -------- @@ -6826,7 +6826,7 @@ def mean( Returns ------- scalar or Series (if level specified) - Median of the values for the requested axis. + Mean of the values for the requested axis. 
See Also -------- From dcb5494e511cee9643ce3748d4450a97ed1a7c03 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Jul 2024 06:45:24 -1000 Subject: [PATCH 149/272] TST: Address UserWarning in matplotlib test (#59168) * TST: Address UserWarning in matplotlib test * Filter the warning instead --- pandas/plotting/_matplotlib/core.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 22be9baf1ff5c..8b108346160d6 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -893,7 +893,13 @@ def _make_legend(self) -> None: elif self.subplots and self.legend: for ax in self.axes: if ax.get_visible(): - ax.legend(loc="best") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "No artists with labels found to put in legend.", + UserWarning, + ) + ax.legend(loc="best") @final @staticmethod From e3af7c6675c24eb2ebcbebbf64320177ed4bed84 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Fri, 5 Jul 2024 13:03:02 -0400 Subject: [PATCH 150/272] DOC: Fix for excelwriter engine for ods files (#59185) doc fix for excelwriter engine for ods files --- pandas/io/excel/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index de0ef3728fb6e..f83f9cb1c8d74 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -957,7 +957,7 @@ class ExcelWriter(Generic[_WorkbookT]): * `xlsxwriter `__ for xlsx files if xlsxwriter is installed otherwise `openpyxl `__ - * `odswriter `__ for ods files + * `odf `__ for ods files See :meth:`DataFrame.to_excel` for typical usage. @@ -1004,7 +1004,7 @@ class ExcelWriter(Generic[_WorkbookT]): * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)`` * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)`` * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)`` - * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` + * odf: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` .. 
versionadded:: 1.3.0 From 039edee0bc1a729ee9b67a6f9a337f7cd03118d1 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Fri, 5 Jul 2024 19:08:08 +0200 Subject: [PATCH 151/272] DEPR: lowercase strings `w`, `d`, `b` and ``c`` denoting frequencies in `Week`, `Day`, `BusinessDay` and `CustomBusinessDay` classes (#58998) * deprecate lowercase 'd' * fix tests, add tests * deprecate lowercase alias 'b', fix tests * fix tests and docs * fix tests, fix an example in v0.20.0 * deprecate 'c',fix examples in v0.22.0, add tests and a note to v3.0.0 * correct examples in whatsnew * update examples in user_guide/io.rst --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/user_guide/io.rst | 4 +- doc/source/whatsnew/v0.18.0.rst | 25 ++++++-- doc/source/whatsnew/v0.20.0.rst | 27 ++++++--- doc/source/whatsnew/v0.22.0.rst | 21 +++++-- doc/source/whatsnew/v0.23.0.rst | 60 +++++++++++++++---- doc/source/whatsnew/v3.0.0.rst | 2 + pandas/_libs/tslibs/dtypes.pyx | 10 ++++ pandas/_libs/tslibs/offsets.pyx | 20 +++---- pandas/io/json/_table_schema.py | 2 +- pandas/tests/arithmetic/test_period.py | 2 +- pandas/tests/frame/methods/test_astype.py | 10 +++- pandas/tests/frame/methods/test_reindex.py | 5 +- pandas/tests/frame/test_query_eval.py | 2 +- pandas/tests/groupby/test_groupby_dropna.py | 4 +- .../indexes/datetimes/methods/test_snap.py | 10 +++- .../indexes/datetimes/test_date_range.py | 20 +++++++ .../indexes/datetimes/test_partial_slicing.py | 2 +- .../tests/indexes/period/test_constructors.py | 24 ++++++++ .../indexes/timedeltas/test_scalar_compat.py | 46 +++++++------- pandas/tests/io/formats/test_to_csv.py | 2 +- .../tests/io/json/test_json_table_schema.py | 28 ++++----- pandas/tests/plotting/test_datetimelike.py | 4 +- pandas/tests/resample/test_base.py | 2 +- pandas/tests/resample/test_datetime_index.py | 43 ++++++++++--- pandas/tests/resample/test_period_index.py | 4 +- pandas/tests/resample/test_resample_api.py | 2 +- pandas/tests/resample/test_time_grouper.py | 4 +- pandas/tests/scalar/period/test_period.py | 23 ++++++- .../scalar/timedelta/methods/test_round.py | 2 +- pandas/tests/tslibs/test_to_offset.py | 40 +++++++++---- pandas/tests/window/test_groupby.py | 15 +++-- pandas/tests/window/test_rolling.py | 19 +++--- pandas/tests/window/test_timeseries_window.py | 6 +- 33 files changed, 351 insertions(+), 139 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 64b151c167ef3..be40710a9e307 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2161,7 +2161,7 @@ a JSON string with two fields, ``schema`` and ``data``. { "A": [1, 2, 3], "B": ["a", "b", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=3), + "C": pd.date_range("2016-01-01", freq="D", periods=3), }, index=pd.Index(range(3), name="idx"), ) @@ -2270,7 +2270,7 @@ round-trippable manner. { "foo": [1, 2, 3, 4], "bar": ["a", "b", "c", "d"], - "baz": pd.date_range("2018-01-01", freq="d", periods=4), + "baz": pd.date_range("2018-01-01", freq="D", periods=4), "qux": pd.Categorical(["a", "b", "c", "c"]), }, index=pd.Index(range(4), name="idx"), diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 569197fe9daf5..563035e0e2940 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -322,15 +322,28 @@ Tz-aware are rounded, floored and ceiled in local times Timedeltas -.. ipython:: python +.. 
code-block:: ipython + + In [37]: t = pd.timedelta_range('1 days 2 hr 13 min 45 us', periods=3, freq='d') - t = pd.timedelta_range('1 days 2 hr 13 min 45 us', periods=3, freq='d') - t - t.round('10min') + In [38]: t + Out[38]: + TimedeltaIndex(['1 days 02:13:00.000045', '2 days 02:13:00.000045', + '3 days 02:13:00.000045'], + dtype='timedelta64[ns]', freq='D') + + In [39]: t.round('10min') + Out[39]: + TimedeltaIndex(['1 days 02:10:00', '2 days 02:10:00', + '3 days 02:10:00'], + dtype='timedelta64[ns]', freq=None) # Timedelta scalar - t[0] - t[0].round('2h') + In [40]: t[0] + Out[40]: Timedelta('1 days 02:13:00.000045') + + In [41]: t[0].round('2h') + Out[41]: Timedelta('1 days 02:00:00') In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available through the ``.dt`` accessor of ``Series``. diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index f63db945165e7..d6d1d96ccc878 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -308,15 +308,26 @@ The new orient ``'table'`` for :meth:`DataFrame.to_json` will generate a `Table Schema`_ compatible string representation of the data. -.. ipython:: python +.. code-block:: ipython - df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, - index=pd.Index(range(3), name='idx')) - df - df.to_json(orient='table') + In [38]: df = pd.DataFrame( + ....: {'A': [1, 2, 3], + ....: 'B': ['a', 'b', 'c'], + ....: 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, + ....: index=pd.Index(range(3), name='idx')) + In [39]: df + Out[39]: + A B C + idx + 0 1 a 2016-01-01 + 1 2 b 2016-01-02 + 2 3 c 2016-01-03 + + [3 rows x 3 columns] + + In [40]: df.to_json(orient='table') + Out[40]: + '{"schema":{"fields":[{"name":"idx","type":"integer"},{"name":"A","type":"integer"},{"name":"B","type":"string"},{"name":"C","type":"datetime"}],"primaryKey":["idx"],"pandas_version":"1.4.0"},"data":[{"idx":0,"A":1,"B":"a","C":"2016-01-01T00:00:00.000"},{"idx":1,"A":2,"B":"b","C":"2016-01-02T00:00:00.000"},{"idx":2,"A":3,"B":"c","C":"2016-01-03T00:00:00.000"}]}' See :ref:`IO: Table Schema for more information `. diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index a33a8f7addeef..8a9227ac37b67 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -157,16 +157,27 @@ sum and ``1`` for product. *pandas 0.22.0* -.. ipython:: python +.. code-block:: ipython + + In [11]: s = pd.Series([1, 1, np.nan, np.nan], + ....: index=pd.date_range("2017", periods=4)) - s = pd.Series([1, 1, np.nan, np.nan], index=pd.date_range("2017", periods=4)) - s.resample("2d").sum() + In [12]: s.resample("2d").sum() + Out[12]: + 2017-01-01 2.0 + 2017-01-03 0.0 + Freq: 2D, Length: 2, dtype: float64 To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. -.. ipython:: python +.. 
code-block:: ipython + + In [13]: s.resample("2d").sum(min_count=1) + Out[13]: + 2017-01-01 2.0 + 2017-01-03 NaN + Freq: 2D, Length: 2, dtype: float64 - s.resample("2d").sum(min_count=1) In particular, upsampling and taking the sum or product is affected, as upsampling introduces missing values even if the original series was diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 808741ccf4475..663b47a4d2d55 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -50,19 +50,55 @@ JSON read/write round-trippable with ``orient='table'`` A ``DataFrame`` can now be written to and subsequently read back via JSON while preserving metadata through usage of the ``orient='table'`` argument (see :issue:`18912` and :issue:`9146`). Previously, none of the available ``orient`` values guaranteed the preservation of dtypes and index names, amongst other metadata. -.. ipython:: python +.. code-block:: ipython - df = pd.DataFrame({'foo': [1, 2, 3, 4], - 'bar': ['a', 'b', 'c', 'd'], - 'baz': pd.date_range('2018-01-01', freq='d', periods=4), - 'qux': pd.Categorical(['a', 'b', 'c', 'c'])}, - index=pd.Index(range(4), name='idx')) - df - df.dtypes - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') - new_df - new_df.dtypes + In [1]: df = pd.DataFrame({'foo': [1, 2, 3, 4], + ...: 'bar': ['a', 'b', 'c', 'd'], + ...: 'baz': pd.date_range('2018-01-01', freq='d', periods=4), + ...: 'qux': pd.Categorical(['a', 'b', 'c', 'c'])}, + ...: index=pd.Index(range(4), name='idx')) + + In [2]: df + Out[2]: + foo bar baz qux + idx + 0 1 a 2018-01-01 a + 1 2 b 2018-01-02 b + 2 3 c 2018-01-03 c + 3 4 d 2018-01-04 c + + [4 rows x 4 columns] + + In [3]: df.dtypes + Out[3]: + foo int64 + bar object + baz datetime64[ns] + qux category + Length: 4, dtype: object + + In [4]: df.to_json('test.json', orient='table') + + In [5]: new_df = pd.read_json('test.json', orient='table') + + In [6]: new_df + Out[6]: + foo bar baz qux + idx + 0 1 a 2018-01-01 a + 1 2 b 2018-01-02 b + 2 3 c 2018-01-03 c + 3 4 d 2018-01-04 c + + [4 rows x 4 columns] + + In [7]: new_df.dtypes + Out[7]: + foo int64 + bar object + baz datetime64[ns] + qux category + Length: 4, dtype: object Please note that the string ``index`` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index be4b9c218f9f5..41d18feaa532c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -279,6 +279,8 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`) - Deprecated behavior of :meth:`Series.dt.to_pytimedelta`, in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta; this matches the behavior of other :meth:`Series.dt` properties. (:issue:`57463`) +- Deprecated lowercase strings ``d``, ``b`` and ``c`` denoting frequencies in :class:`Day`, :class:`BusinessDay` and :class:`CustomBusinessDay` in favour of ``D``, ``B`` and ``C`` (:issue:`58998`) +- Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. 
(:issue:`58998`) - Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`) - Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`) - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 0fdadf5b7611d..40d2395b38f04 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -359,6 +359,16 @@ cdef dict c_DEPR_UNITS = { cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { "w": "W", + "w-mon": "W-MON", + "w-tue": "W-TUE", + "w-wed": "W-WED", + "w-thu": "W-THU", + "w-fri": "W-FRI", + "w-sat": "W-SAT", + "w-sun": "W-SUN", + "d": "D", + "b": "B", + "c": "C", "MIN": "min", } diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 80889aeb58332..5ae2de907af18 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4890,16 +4890,16 @@ cpdef to_offset(freq, bint is_period=False): ) name = c_PERIOD_TO_OFFSET_FREQSTR.get(name.upper()) - if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " - f" instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name) + if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f" instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = _lite_rule_alias.get(name) or name diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index b44aecff79779..d966e38fa11a5 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -275,7 +275,7 @@ def build_table_schema( >>> df = pd.DataFrame( ... {'A': [1, 2, 3], ... 'B': ['a', 'b', 'c'], - ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), + ... 'C': pd.date_range('2016-01-01', freq='D', periods=3), ... 
}, index=pd.Index(range(3), name='idx')) >>> build_table_schema(df) {'fields': \ diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 539df9d61a7b2..67762e0b89c73 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1086,7 +1086,7 @@ def test_parr_add_timedeltalike_minute_gt1(self, three_days, box_with_array): with pytest.raises(TypeError, match=msg): other - rng - @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5d"]) + @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5D"]) def test_parr_add_timedeltalike_tick_gt1(self, three_days, freqstr, box_with_array): # GH#23031 adding a time-delta-like offset to a PeriodArray that has # tick-like frequency with n != 1 diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 41129966cd589..edc90ce77ad3a 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -715,8 +715,12 @@ def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): df.astype(float, errors=errors) def test_astype_tz_conversion(self): - # GH 35973 - val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + # GH 35973, GH#58998 + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + val = { + "tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London") + } df = DataFrame(val) result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"}) @@ -727,7 +731,7 @@ def test_astype_tz_conversion(self): @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"]) def test_astype_tz_object_conversion(self, tz): # GH 35973 - val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + val = {"tz": date_range("2020-08-30", freq="D", periods=2, tz="Europe/London")} expected = DataFrame(val) # convert expected to object dtype from other tz str (independently tested) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 45109991c4553..37adc31fb0f4d 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -754,7 +754,10 @@ def test_reindex_axes(self): index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], columns=["a", "b", "c"], ) - time_freq = date_range("2012-01-01", "2012-01-03", freq="d") + + msg = "'d' is deprecated and will be removed in a future version." 
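+        # Editor's aside (illustrative, not part of this patch): only the alias
+        # spelling is deprecated; the guarded call below builds the same index as
+        #     date_range("2012-01-01", "2012-01-03", freq="D")
+        # so the assertions that follow are unchanged.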
+ with tm.assert_produces_warning(FutureWarning, match=msg): + time_freq = date_range("2012-01-01", "2012-01-03", freq="d") some_cols = ["a", "b"] index_freq = df.reindex(index=time_freq).index.freq diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index c9ea5f379f1e9..b791868b173e4 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -763,7 +763,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture df_index = date_range( - start="2019-01-01", freq="1d", periods=10, tz=tz, name="time" + start="2019-01-01", freq="1D", periods=10, tz=tz, name="time" ) expected = DataFrame(index=df_index) df = DataFrame(index=df_index) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 4749e845a0e59..cedbd577da0ca 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -420,7 +420,7 @@ def test_groupby_drop_nan_with_multi_index(): ), ), "datetime64[ns]", - "period[d]", + "period[D]", "Sparse[float]", ], ) @@ -437,7 +437,7 @@ def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index): # Unique values to use for grouper, depends on dtype if dtype in ("string", "string[pyarrow]"): uniques = {"x": "x", "y": "y", "z": pd.NA} - elif dtype in ("datetime64[ns]", "period[d]"): + elif dtype in ("datetime64[ns]", "period[D]"): uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA} else: uniques = {"x": 1, "y": 2, "z": np.nan} diff --git a/pandas/tests/indexes/datetimes/methods/test_snap.py b/pandas/tests/indexes/datetimes/methods/test_snap.py index 651e4383a3fac..a3c06ac6257cf 100644 --- a/pandas/tests/indexes/datetimes/methods/test_snap.py +++ b/pandas/tests/indexes/datetimes/methods/test_snap.py @@ -7,6 +7,8 @@ import pandas._testing as tm +@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") +@pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") @pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) @pytest.mark.parametrize("name", [None, "my_dti"]) def test_dti_snap(name, tz, unit): @@ -27,7 +29,9 @@ def test_dti_snap(name, tz, unit): dti = dti.as_unit(unit) result = dti.snap(freq="W-MON") - expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") + msg = "'w-mon' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") expected = expected.repeat([3, 4]) expected = expected.as_unit(unit) tm.assert_index_equal(result, expected) @@ -37,7 +41,9 @@ def test_dti_snap(name, tz, unit): result = dti.snap(freq="B") - expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") + msg = "'b' is deprecated and will be removed in a future version." 
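+    # Editor's aside (illustrative, not part of this patch): "b" still resolves
+    # to the same offset while warning, e.g.
+    #     >>> pd.tseries.frequencies.to_offset("b")  # FutureWarning
+    #     <BusinessDay>
+    # which is why only the expected-index construction needs the warning guard.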
+ with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") expected = expected.repeat([1, 1, 1, 2, 2]) expected = expected.as_unit(unit) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index ee1c906efea73..b37b5cf74b347 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -791,6 +791,26 @@ def test_frequency_A_raises(self, freq): with pytest.raises(ValueError, match=msg): date_range("1/1/2000", periods=2, freq=freq) + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2W", "2w"), + ("2W-WED", "2w-wed"), + ("2B", "2b"), + ("2D", "2d"), + ("2C", "2c"), + ], + ) + def test_date_range_depr_lowercase_frequency(self, freq, freq_depr): + # GH#58998 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + "in a future version." + + expected = date_range("1/1/2000", periods=4, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + class TestDateRangeTZ: """Tests for date_range with timezones""" diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 173b32b12e2d1..94175a56f1c4a 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -35,7 +35,7 @@ def test_string_index_series_name_converted(self): def test_stringified_slice_with_tz(self): # GH#2658 start = "2013-01-07" - idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern") + idx = date_range(start=start, freq="1D", periods=10, tz="US/Eastern") df = DataFrame(np.arange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index aca765e7167b2..be07a71b283fd 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -73,6 +73,30 @@ def test_period_index_T_L_U_N_raises(self, freq_depr): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + @pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") + @pytest.mark.parametrize( + "freq,freq_depr", + [("2W", "2w"), ("2W-FRI", "2w-fri"), ("2D", "2d"), ("2B", "2b")], + ) + def test_period_index_depr_lowercase_frequency(self, freq, freq_depr): + # GH#58998 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) + + expected = PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) + + expected = period_range(start="2020-01-01", end="2020-01-02", freq=freq) + tm.assert_index_equal(result, expected) + class TestPeriodIndex: def test_from_ordinals(self): diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 9f0552f8baa90..9a00c556dc515 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -103,30 +103,34 @@ def test_round(self): t1c = TimedeltaIndex(np.array([1, 1, 1], "m8[D]")).as_unit("ns") # note that negative times round DOWN! so don't give whole numbers - for freq, s1, s2 in [ - ("ns", t1, t2), - ("us", t1, t2), - ( - "ms", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + for freq, s1, s2 in [ + ("ns", t1, t2), + ("us", t1, t2), + ( + "ms", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), ), - ), - ( - "s", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ( + "s", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), ), - ), - ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("d", t1c, -1 * t1c), - ]: - r1 = t1.round(freq) + ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("d", t1c, -1 * t1c), + ]: + r1 = t1.round(freq) + r2 = t2.round(freq) + tm.assert_index_equal(r1, s1) - r2 = t2.round(freq) tm.assert_index_equal(r2, s2) def test_components(self): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 49776d532db1d..7bf041a50b745 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -221,7 +221,7 @@ def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype): def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) - df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) + df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="D")}) expected_rows = [ ",A", diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index a0d5b3a741aaf..e61a8ee722443 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -32,7 +32,7 @@ def df_schema(): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), @@ -45,12 +45,12 @@ def df_table(): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", 
freq="D", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], - "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + "H": pd.date_range("2016-01-01", freq="D", periods=4, tz="US/Central"), }, index=pd.Index(range(4), name="idx"), ) @@ -687,7 +687,7 @@ class TestTableOrientReader: {"ints": [1, 2, 3, 4]}, {"objects": ["a", "b", "c", "d"]}, {"objects": ["1", "2", "3", "4"]}, - {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)}, + {"date_ranges": pd.date_range("2016-01-01", freq="D", periods=4)}, {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))}, { "ordered_cats": pd.Series( @@ -699,7 +699,7 @@ class TestTableOrientReader: {"bools": [True, False, False, True]}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2016-01-01", freq="D", periods=4, tz="US/Central" ) # added in # GH 35973 }, ], @@ -738,7 +738,7 @@ def test_read_json_table_orient_raises(self, index_nm): {"ints": [1, 2, 3, 4]}, {"objects": ["a", "b", "c", "d"]}, {"objects": ["1", "2", "3", "4"]}, - {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)}, + {"date_ranges": pd.date_range("2016-01-01", freq="D", periods=4)}, {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))}, { "ordered_cats": pd.Series( @@ -750,7 +750,7 @@ def test_read_json_table_orient_raises(self, index_nm): {"bools": [True, False, False, True]}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2016-01-01", freq="D", periods=4, tz="US/Central" ) # added in # GH 35973 }, ], @@ -772,15 +772,15 @@ def test_read_json_table_period_orient(self, index_nm, vals): pd.Index(range(4)), pd.date_range( "2020-08-30", - freq="d", + freq="D", periods=4, )._with_freq(None), pd.date_range( - "2020-08-30", freq="d", periods=4, tz="US/Central" + "2020-08-30", freq="D", periods=4, tz="US/Central" )._with_freq(None), pd.MultiIndex.from_product( [ - pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"), + pd.date_range("2020-08-30", freq="D", periods=2, tz="US/Central"), ["x", "y"], ], ), @@ -790,10 +790,10 @@ def test_read_json_table_period_orient(self, index_nm, vals): "vals", [ {"floats": [1.1, 2.2, 3.3, 4.4]}, - {"dates": pd.date_range("2020-08-30", freq="d", periods=4)}, + {"dates": pd.date_range("2020-08-30", freq="D", periods=4)}, { "timezones": pd.date_range( - "2020-08-30", freq="d", periods=4, tz="Europe/London" + "2020-08-30", freq="D", periods=4, tz="Europe/London" ) }, ], @@ -810,12 +810,12 @@ def test_comprehensive(self): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), # 'D': pd.timedelta_range('1h', periods=4, freq='min'), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.1, 2.2, 3.3, 4.4], - "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + "H": pd.date_range("2016-01-01", freq="D", periods=4, tz="US/Central"), "I": [True, False, False, True], }, index=pd.Index(range(4), name="idx"), diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index a9135ee583d91..1275f3d6f7d6d 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1543,7 +1543,7 @@ 
def test_format_timedelta_ticks_wide(self): "9 days 06:13:20", ] - rng = timedelta_range("0", periods=10, freq="1 d") + rng = timedelta_range("0", periods=10, freq="1 D") df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng) _, ax = mpl.pyplot.subplots() ax = df.plot(fontsize=2, ax=ax) @@ -1562,7 +1562,7 @@ def test_timedelta_plot(self): def test_timedelta_long_period(self): # test long period - index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 d") + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 D") s = Series(np.random.default_rng(2).standard_normal(len(index)), index) _, ax = mpl.pyplot.subplots() _check_plot_works(s.plot, ax=ax) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index f4ea6b1d3f3de..b2d9f6c0e3eb0 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -436,7 +436,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): empty_series_dti = Series([], index, dtype) with tm.assert_produces_warning(warn, match=msg): - rs = empty_series_dti.resample("d", group_keys=False) + rs = empty_series_dti.resample("D", group_keys=False) try: getattr(rs, resample_method)() except DataError: diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index cf0cbabb0258c..dc2ddcc70828f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -239,7 +239,9 @@ def _ohlc(group): def test_resample_how_callables(unit): # GH#7929 data = np.arange(5, dtype=np.int64) - ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit) + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit) df = DataFrame({"A": data, "B": data}, index=ind) def fn(x, a=1): @@ -334,7 +336,9 @@ def test_resample_basic_from_daily(unit): s = Series(np.random.default_rng(2).random(len(dti)), dti) # to weekly - result = s.resample("w-sun").last() + msg = "'w-sun' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample("w-sun").last() assert len(result) == 3 assert (result.index.dayofweek == [6, 6, 6]).all() @@ -1190,7 +1194,9 @@ def test_anchored_lowercase_buglet(unit): dates = date_range("4/16/2012 20:00", periods=50000, freq="s").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates) # it works! - ts.resample("d").mean() + msg = "'d' is deprecated and will be removed in a future version." 
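+    # Editor's aside (illustrative, not part of this patch): the uppercase
+    # spelling resamples without any warning, i.e. ts.resample("D").mean()
+    # produces the same daily bins as the deprecated ts.resample("d").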
+ with tm.assert_produces_warning(FutureWarning, match=msg): + ts.resample("d").mean() def test_upsample_apply_functions(unit): @@ -1531,9 +1537,9 @@ def test_groupby_with_dst_time_change(unit): ) df = DataFrame([1, 2], index=index) - result = df.groupby(Grouper(freq="1d")).last() + result = df.groupby(Grouper(freq="1D")).last() expected_index_values = date_range( - "2016-11-02", "2016-11-24", freq="d", tz="America/Chicago" + "2016-11-02", "2016-11-24", freq="D", tz="America/Chicago" ).as_unit(unit) index = DatetimeIndex(expected_index_values) @@ -2018,7 +2024,7 @@ def test_resample_empty_series_with_tz(): def test_resample_M_Q_Y_raises(freq): msg = f"Invalid frequency: {freq}" - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) with pytest.raises(ValueError, match=msg): s.resample(freq).mean() @@ -2027,11 +2033,32 @@ def test_resample_M_Q_Y_raises(freq): def test_resample_BM_BQ_raises(freq): msg = f"Invalid frequency: {freq}" - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) with pytest.raises(ValueError, match=msg): s.resample(freq).mean() +@pytest.mark.parametrize( + "freq,freq_depr,data", + [ + ("1W-SUN", "1w-sun", ["2013-01-06"]), + ("1D", "1d", ["2013-01-01"]), + ("1B", "1b", ["2013-01-01"]), + ("1C", "1c", ["2013-01-01"]), + ], +) +def test_resample_depr_lowercase_frequency(freq, freq_depr, data): + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." + + s = Series(range(5), index=date_range("20130101", freq="h", periods=5)) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample(freq_depr).mean() + + exp_dti = DatetimeIndex(data=data, dtype="datetime64[ns]", freq=freq) + expected = Series(2.0, index=exp_dti) + tm.assert_series_equal(result, expected) + + def test_resample_ms_closed_right(unit): # https://github.com/pandas-dev/pandas/issues/55271 dti = date_range(start="2020-01-31", freq="1min", periods=6000, unit=unit) @@ -2129,6 +2156,6 @@ def test_arrow_timestamp_resample(tz): def test_resample_A_raises(freq): msg = f"Invalid frequency: {freq[1:]}" - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) with pytest.raises(ValueError, match=msg): s.resample(freq).mean() diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 89a21f0565793..e17529dfab00c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -164,12 +164,12 @@ def test_basic_downsample(self, simple_period_range_series): ("Y-DEC", ""), ("Q-MAR", ""), ("M", ""), - ("w-thu", ""), + ("W-THU", ""), ], ) def test_not_subperiod(self, simple_period_range_series, rule, expected_error_msg): # These are incompatible period rules for resampling - ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="W-WED") msg = ( "Frequency cannot be resampled to " f"{expected_error_msg}, as they are not sub or super periods" diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index bf1f6bd34b171..a8fb1b392322d 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -732,7 +732,7 @@ def 
test_agg_with_datetime_index_list_agg_func(col_name): ), columns=[col_name], ) - result = df.resample("1d").aggregate(["mean"]) + result = df.resample("1D").aggregate(["mean"]) expected = DataFrame( [47.5, 143.5, 195.5], index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"), diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 2646106b9b97c..f694b90a707c7 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -193,7 +193,7 @@ def test_aggregate_nth(): ) def test_resample_entirely_nat_window(method, method_args, unit): ser = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4)) - result = methodcaller(method, **method_args)(ser.resample("2d")) + result = methodcaller(method, **method_args)(ser.resample("2D")) exp_dti = pd.DatetimeIndex(["2017-01-01", "2017-01-03"], dtype="M8[ns]", freq="2D") expected = Series([0.0, unit], index=exp_dti) @@ -372,7 +372,7 @@ def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): for df in dfs: result = df.groupby("volume").apply( - lambda x: x.resample("1d").interpolate(method="linear"), + lambda x: x.resample("1D").interpolate(method="linear"), include_groups=False, ) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 49bd48b40e67a..fe51817a78be8 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -117,7 +117,9 @@ def test_construction(self): i2 = Period("3/1/2005", freq="D") assert i1 == i2 - i3 = Period(year=2005, month=3, day=1, freq="d") + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i3 = Period(year=2005, month=3, day=1, freq="d") assert i1 == i3 i1 = Period("2007-01-01 09:00:00.001") @@ -613,6 +615,25 @@ def test_period_large_ordinal(self, hour): p = Period(ordinal=2562048 + hour, freq="1h") assert p.hour == hour + @pytest.mark.filterwarnings( + "ignore:Period with BDay freq is deprecated:FutureWarning" + ) + @pytest.mark.parametrize( + "freq,freq_depr", + [("2W", "2w"), ("2W-FRI", "2w-fri"), ("2D", "2d"), ("2B", "2b")], + ) + def test_period_deprecated_lowercase_freq(self, freq, freq_depr): + # GH#58998 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Period("2016-03-01 09:00", freq=freq_depr) + + expected = Period("2016-03-01 09:00", freq=freq) + assert result == expected + class TestPeriodMethods: def test_round_trip(self): diff --git a/pandas/tests/scalar/timedelta/methods/test_round.py b/pandas/tests/scalar/timedelta/methods/test_round.py index 082c36999e06f..96cb1c07d2b76 100644 --- a/pandas/tests/scalar/timedelta/methods/test_round.py +++ b/pandas/tests/scalar/timedelta/methods/test_round.py @@ -38,7 +38,7 @@ class TestTimedeltaRound: ("min", "1 days 02:35:00", "-1 days 02:35:00"), ("12min", "1 days 02:36:00", "-1 days 02:36:00"), ("h", "1 days 03:00:00", "-1 days 03:00:00"), - ("d", "1 days", "-1 days"), + ("D", "1 days", "-1 days"), ], ) def test_round(self, freq, s1, s2): diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 07bdfca8f2f2d..c123c00e749db 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -61,11 +61,11 @@ def test_to_offset_negative(freqstr, expected): "2SMS-15D", "100foo", # Invalid leading +/- signs. - "+-1d", + "+-1D", "-+1h", "+1", "-7", - "+d", + "+D", "-m", # Invalid shortcut anchors. "SME-0", @@ -128,9 +128,14 @@ def test_to_offset_leading_zero(freqstr, expected): assert result.n == expected -@pytest.mark.parametrize("freqstr,expected", [("+1d", 1), ("+2h30min", 150)]) -def test_to_offset_leading_plus(freqstr, expected): - result = to_offset(freqstr) +@pytest.mark.parametrize( + "freqstr,expected,wrn", [("+1d", 1, FutureWarning), ("+2h30min", 150, None)] +) +def test_to_offset_leading_plus(freqstr, expected, wrn): + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(wrn, match=msg): + result = to_offset(freqstr) assert result.n == expected @@ -176,14 +181,6 @@ def test_anchored_shortcuts(shortcut, expected): assert result == expected -def test_to_offset_lowercase_frequency_w_deprecated(): - # GH#54939 - msg = "'w' is deprecated and will be removed in a future version" - - with tm.assert_produces_warning(FutureWarning, match=msg): - to_offset("2w") - - @pytest.mark.parametrize( "freq_depr", [ @@ -224,3 +221,20 @@ def test_to_offset_uppercase_frequency_deprecated(freq_depr): with tm.assert_produces_warning(FutureWarning, match=depr_msg): to_offset(freq_depr) + + +@pytest.mark.parametrize( + "freq_depr,expected", + [ + ("2w", offsets.Week(2, weekday=6)), + ("2b", offsets.BusinessDay(2)), + ("2d", offsets.Day(2)), + ], +) +def test_to_offset_lowercase_frequency_deprecated(freq_depr, expected): + # GH#54939, GH#58998 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
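+    # Editor's aside (illustrative, not part of this patch): Week(2, weekday=6)
+    # is expected because a bare "W"/"w" anchors to Sunday, i.e.
+    #     to_offset("2w") == to_offset("2W") == to_offset("2W-SUN")
+    # and only the lowercase spelling additionally emits the FutureWarning.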
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_offset(freq_depr) + assert result == expected diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 120470b09a92b..4d37c6d57f788 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -582,7 +582,7 @@ def test_groupby_rolling_string_index(self): groups = df.groupby("group") df["count_to_date"] = groups.cumcount() - rolling_groups = groups.rolling("10d", on="eventTime") + rolling_groups = groups.rolling("10D", on="eventTime") result = rolling_groups.apply(lambda df: df.shape[0]) expected = DataFrame( [ @@ -623,11 +623,14 @@ def test_groupby_rolling_count_closed_on(self, unit): "date": date_range(end="20190101", periods=6, unit=unit), } ) - result = ( - df.groupby("group") - .rolling("3d", on="date", closed="left")["column1"] - .count() - ) + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + df.groupby("group") + .rolling("3d", on="date", closed="left")["column1"] + .count() + ) dti = DatetimeIndex( [ "2018-12-27", diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 0f2386d1f229f..af3194b5085c4 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1153,7 +1153,7 @@ def test_timeoffset_as_window_parameter_for_corr(unit): index=dti, ) - res = df.rolling(window="3d").corr() + res = df.rolling(window="3D").corr() tm.assert_frame_equal(exp, res) @@ -1380,17 +1380,20 @@ def test_invalid_method(): Series(range(1)).rolling(1, method="foo") -@pytest.mark.parametrize("window", [1, "1d"]) -def test_rolling_descending_date_order_with_offset(window, frame_or_series): +def test_rolling_descending_date_order_with_offset(frame_or_series): # GH#40002 - idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") - obj = frame_or_series(range(1, 4), index=idx) - result = obj.rolling("1d", closed="left").sum() + msg = "'d' is deprecated and will be removed in a future version." 
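+    # Editor's aside (illustrative, not part of this patch): offset-based
+    # window strings follow the same alias rules, so rolling("1D") is the
+    # warning-free replacement exercised in the second half of this test.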
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") + obj = frame_or_series(range(1, 4), index=idx) + result = obj.rolling("1d", closed="left").sum() + expected = frame_or_series([np.nan, 1, 2], index=idx) tm.assert_equal(result, expected) - result = obj.iloc[::-1].rolling("1d", closed="left").sum() - idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1d") + result = obj.iloc[::-1].rolling("1D", closed="left").sum() + idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1D") expected = frame_or_series([np.nan, 3, 2], index=idx) tm.assert_equal(result, expected) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 820b0134cc577..eacdaddfa28b0 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -101,7 +101,7 @@ def test_on(self, regular): # column is valid df = df.copy() df["C"] = date_range("20130101", periods=len(df)) - df.rolling(window="2d", on="C").sum() + df.rolling(window="2D", on="C").sum() # invalid columns msg = "window must be an integer" @@ -109,7 +109,7 @@ def test_on(self, regular): df.rolling(window="2d", on="B") # ok even though on non-selected - df.rolling(window="2d", on="C").B.sum() + df.rolling(window="2D", on="C").B.sum() def test_monotonic_on(self): # on/index must be monotonic @@ -682,7 +682,7 @@ def test_rolling_on_multi_index_level(self): [date_range("20190101", periods=3), range(2)], names=["date", "seq"] ), ) - result = df.rolling("10d", on=df.index.get_level_values("date")).sum() + result = df.rolling("10D", on=df.index.get_level_values("date")).sum() expected = DataFrame( {"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index ) From f3ad4d50aee360092c54b918b28c2cba90c90f20 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 5 Jul 2024 07:54:38 -1000 Subject: [PATCH 152/272] REF: Make concat not stateful. 
(#59141)

* Make concat non-stateful

* Fix bug and error message

* Update check

* Fix typing
---
 pandas/core/generic.py              |  11 +-
 pandas/core/reshape/concat.py       | 560 +++++++++++++++------------
 pandas/tests/generic/test_frame.py  |   3 +-
 pandas/tests/generic/test_series.py |   7 +-
 4 files changed, 311 insertions(+), 270 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 33190f905be13..312f5d20d794f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6017,17 +6017,16 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
             object.__setattr__(self, name, getattr(other, name, None))
 
         if method == "concat":
+            objs = kwargs["objs"]
             # propagate attrs only if all concat arguments have the same attrs
-            if all(bool(obj.attrs) for obj in other.objs):
+            if all(bool(obj.attrs) for obj in objs):
                 # all concatenate arguments have non-empty attrs
-                attrs = other.objs[0].attrs
-                have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:])
+                attrs = objs[0].attrs
+                have_same_attrs = all(obj.attrs == attrs for obj in objs[1:])
                 if have_same_attrs:
                     self.attrs = deepcopy(attrs)
 
-            allows_duplicate_labels = all(
-                x.flags.allows_duplicate_labels for x in other.objs
-            )
+            allows_duplicate_labels = all(x.flags.allows_duplicate_labels for x in objs)
             self.flags.allows_duplicate_labels = allows_duplicate_labels
 
         return self
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 2d2787e56f402..6381869c3e559 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -16,10 +16,12 @@
 import numpy as np
 
 from pandas._libs import lib
-from pandas.util._decorators import cache_readonly
 from pandas.util._exceptions import find_stack_level
 
-from pandas.core.dtypes.common import is_bool
+from pandas.core.dtypes.common import (
+    is_bool,
+    is_scalar,
+)
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -385,291 +387,328 @@ def concat(
             DeprecationWarning,
             stacklevel=find_stack_level(),
         )
+    if join == "outer":
+        intersect = False
+    elif join == "inner":
+        intersect = True
+    else:  # pragma: no cover
+        raise ValueError(
+            "Only can inner (intersect) or outer (union) join the other axis"
+        )
 
-    op = _Concatenator(
-        objs,
-        axis=axis,
-        ignore_index=ignore_index,
-        join=join,
-        keys=keys,
-        levels=levels,
-        names=names,
-        verify_integrity=verify_integrity,
-        sort=sort,
-    )
-
-    return op.get_result()
+    if not is_bool(sort):
+        raise ValueError(
+            f"The 'sort' keyword only accepts boolean values; {sort} was passed."
+ ) + sort = bool(sort) + objs, keys, ndims = _clean_keys_and_objs(objs, keys) -class _Concatenator: - """ - Orchestrates a concatenation operation for BlockManagers - """ + # select an object to be our result reference + sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect) - sort: bool - - def __init__( - self, - objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], - axis: Axis = 0, - join: str = "outer", - keys: Iterable[Hashable] | None = None, - levels=None, - names: list[HashableT] | None = None, - ignore_index: bool = False, - verify_integrity: bool = False, - sort: bool = False, - ) -> None: - if isinstance(objs, (ABCSeries, ABCDataFrame, str)): - raise TypeError( - "first argument must be an iterable of pandas " - f'objects, you passed an object of type "{type(objs).__name__}"' - ) + # Standardize axis parameter to int + if sample.ndim == 1: + from pandas import DataFrame - if join == "outer": - self.intersect = False - elif join == "inner": - self.intersect = True - else: # pragma: no cover - raise ValueError( - "Only can inner (intersect) or outer (union) join the other axis" - ) + bm_axis = DataFrame._get_axis_number(axis) + is_frame = False + is_series = True + else: + bm_axis = sample._get_axis_number(axis) + is_frame = True + is_series = False - if not is_bool(sort): - raise ValueError( - f"The 'sort' keyword only accepts boolean values; {sort} was passed." - ) - # Incompatible types in assignment (expression has type "Union[bool, bool_]", - # variable has type "bool") - self.sort = sort # type: ignore[assignment] + # Need to flip BlockManager axis in the DataFrame special case + bm_axis = sample._get_block_manager_axis(bm_axis) - self.ignore_index = ignore_index - self.verify_integrity = verify_integrity + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis) - objs, keys, ndims = _clean_keys_and_objs(objs, keys) + axis = 1 - bm_axis if is_frame else 0 + names = names or getattr(keys, "names", None) + return _get_result( + objs, + is_series, + bm_axis, + ignore_index, + intersect, + sort, + keys, + levels, + verify_integrity, + names, + axis, + ) - # select an object to be our result reference - sample, objs = _get_sample_object( - objs, ndims, keys, names, levels, self.intersect - ) - # Standardize axis parameter to int - if sample.ndim == 1: - from pandas import DataFrame +def _sanitize_mixed_ndim( + objs: list[Series | DataFrame], + sample: Series | DataFrame, + ignore_index: bool, + axis: AxisInt, +) -> list[Series | DataFrame]: + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + + new_objs = [] + + current_column = 0 + max_ndim = sample.ndim + for obj in objs: + ndim = obj.ndim + if ndim == max_ndim: + pass + + elif ndim != max_ndim - 1: + raise ValueError( + "cannot concatenate unaligned mixed dimensional NDFrame objects" + ) - axis = DataFrame._get_axis_number(axis) - self._is_frame = False - self._is_series = True else: - axis = sample._get_axis_number(axis) - self._is_frame = True - self._is_series = False - - # Need to flip BlockManager axis in the DataFrame special case - axis = sample._get_block_manager_axis(axis) - - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - if len(ndims) > 1: - objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) - - self.objs = objs - - # note: this is the 
BlockManager axis (since DataFrame is transposed) - self.bm_axis = axis - self.axis = 1 - self.bm_axis if self._is_frame else 0 - self.keys = keys - self.names = names or getattr(keys, "names", None) - self.levels = levels - - def _sanitize_mixed_ndim( - self, - objs: list[Series | DataFrame], - sample: Series | DataFrame, - ignore_index: bool, - axis: AxisInt, - ) -> list[Series | DataFrame]: - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - - new_objs = [] - - current_column = 0 - max_ndim = sample.ndim - for obj in objs: - ndim = obj.ndim - if ndim == max_ndim: - pass - - elif ndim != max_ndim - 1: - raise ValueError( - "cannot concatenate unaligned mixed dimensional NDFrame objects" - ) - - else: - name = getattr(obj, "name", None) - if ignore_index or name is None: - if axis == 1: - # doing a row-wise concatenation so need everything - # to line up - name = 0 - else: - # doing a column-wise concatenation so need series - # to have unique names - name = current_column - current_column += 1 - obj = sample._constructor(obj, copy=False) - if isinstance(obj, ABCDataFrame): - obj.columns = range(name, name + 1, 1) + name = getattr(obj, "name", None) + if ignore_index or name is None: + if axis == 1: + # doing a row-wise concatenation so need everything + # to line up + name = 0 else: - obj = sample._constructor({name: obj}, copy=False) - - new_objs.append(obj) - - return new_objs + # doing a column-wise concatenation so need series + # to have unique names + name = current_column + current_column += 1 + obj = sample._constructor(obj, copy=False) + if isinstance(obj, ABCDataFrame): + obj.columns = range(name, name + 1, 1) + else: + obj = sample._constructor({name: obj}, copy=False) - def get_result(self): - cons: Callable[..., DataFrame | Series] - sample: DataFrame | Series + new_objs.append(obj) - # series only - if self._is_series: - sample = cast("Series", self.objs[0]) + return new_objs - # stack blocks - if self.bm_axis == 0: - name = com.consensus_name_attr(self.objs) - cons = sample._constructor - arrs = [ser._values for ser in self.objs] +def _get_result( + objs: list[Series | DataFrame], + is_series: bool, + bm_axis: AxisInt, + ignore_index: bool, + intersect: bool, + sort: bool, + keys: Iterable[Hashable] | None, + levels, + verify_integrity: bool, + names: list[HashableT] | None, + axis: AxisInt, +): + cons: Callable[..., DataFrame | Series] + sample: DataFrame | Series - res = concat_compat(arrs, axis=0) + # series only + if is_series: + sample = cast("Series", objs[0]) - new_index: Index - if self.ignore_index: - # We can avoid surprisingly-expensive _get_concat_axis - new_index = default_index(len(res)) - else: - new_index = self.new_axes[0] + # stack blocks + if bm_axis == 0: + name = com.consensus_name_attr(objs) + cons = sample._constructor - mgr = type(sample._mgr).from_array(res, index=new_index) + arrs = [ser._values for ser in objs] - result = sample._constructor_from_mgr(mgr, axes=mgr.axes) - result._name = name - return result.__finalize__(self, method="concat") + res = concat_compat(arrs, axis=0) - # combine as columns in a frame + if ignore_index: + new_index: Index = default_index(len(res)) else: - data = dict(enumerate(self.objs)) + new_index = _get_concat_axis_series( + objs, + ignore_index, + bm_axis, + keys, + levels, + verify_integrity, + names, + ) - # GH28330 Preserves subclassed objects through concat - cons = sample._constructor_expanddim + mgr = type(sample._mgr).from_array(res, index=new_index) - index, 
columns = self.new_axes - df = cons(data, index=index, copy=False) - df.columns = columns - return df.__finalize__(self, method="concat") + result = sample._constructor_from_mgr(mgr, axes=mgr.axes) + result._name = name + return result.__finalize__(object(), method="concat", objs=objs) - # combine block managers + # combine as columns in a frame else: - sample = cast("DataFrame", self.objs[0]) - - mgrs_indexers = [] - for obj in self.objs: - indexers = {} - for ax, new_labels in enumerate(self.new_axes): - # ::-1 to convert BlockManager ax to DataFrame ax - if ax == self.bm_axis: - # Suppress reindexing on concat axis - continue - - # 1-ax to convert BlockManager axis to DataFrame axis - obj_labels = obj.axes[1 - ax] - if not new_labels.equals(obj_labels): - indexers[ax] = obj_labels.get_indexer(new_labels) - - mgrs_indexers.append((obj._mgr, indexers)) - - new_data = concatenate_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=False - ) + data = dict(enumerate(objs)) - out = sample._constructor_from_mgr(new_data, axes=new_data.axes) - return out.__finalize__(self, method="concat") + # GH28330 Preserves subclassed objects through concat + cons = sample._constructor_expanddim - @cache_readonly - def new_axes(self) -> list[Index]: - if self._is_series and self.bm_axis == 1: - ndim = 2 - else: - ndim = self.objs[0].ndim - return [ - self._get_concat_axis - if i == self.bm_axis - else get_objs_combined_axis( - self.objs, - axis=self.objs[0]._get_block_manager_axis(i), - intersect=self.intersect, - sort=self.sort, + index = get_objs_combined_axis( + objs, + axis=objs[0]._get_block_manager_axis(0), + intersect=intersect, + sort=sort, ) - for i in range(ndim) - ] - - @cache_readonly - def _get_concat_axis(self) -> Index: - """ - Return index to be used along concatenation axis. 
- """ - if self._is_series: - if self.bm_axis == 0: - indexes = [x.index for x in self.objs] - elif self.ignore_index: - idx = default_index(len(self.objs)) - return idx - elif self.keys is None: - names: list[Hashable] = [None] * len(self.objs) - num = 0 - has_names = False - for i, x in enumerate(self.objs): - if x.ndim != 1: - raise TypeError( - f"Cannot concatenate type 'Series' with " - f"object of type '{type(x).__name__}'" - ) - if x.name is not None: - names[i] = x.name - has_names = True - else: - names[i] = num - num += 1 - if has_names: - return Index(names) - else: - return default_index(len(self.objs)) - else: - return ensure_index(self.keys).set_names(self.names) - else: - indexes = [x.axes[self.axis] for x in self.objs] + columns = _get_concat_axis_series( + objs, ignore_index, bm_axis, keys, levels, verify_integrity, names + ) + df = cons(data, index=index, copy=False) + df.columns = columns + return df.__finalize__(object(), method="concat", objs=objs) + + # combine block managers + else: + sample = cast("DataFrame", objs[0]) + + mgrs_indexers = [] + result_axes = new_axes( + objs, + bm_axis, + intersect, + sort, + keys, + names, + axis, + levels, + verify_integrity, + ignore_index, + ) + for obj in objs: + indexers = {} + for ax, new_labels in enumerate(result_axes): + # ::-1 to convert BlockManager ax to DataFrame ax + if ax == bm_axis: + # Suppress reindexing on concat axis + continue + + # 1-ax to convert BlockManager axis to DataFrame axis + obj_labels = obj.axes[1 - ax] + if not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.get_indexer(new_labels) + + mgrs_indexers.append((obj._mgr, indexers)) + + new_data = concatenate_managers( + mgrs_indexers, result_axes, concat_axis=bm_axis, copy=False + ) + + out = sample._constructor_from_mgr(new_data, axes=new_data.axes) + return out.__finalize__(object(), method="concat", objs=objs) - if self.ignore_index: - idx = default_index(sum(len(i) for i in indexes)) - return idx - if self.keys is None: - if self.levels is not None: +def new_axes( + objs: list[Series | DataFrame], + bm_axis: AxisInt, + intersect: bool, + sort: bool, + keys: Iterable[Hashable] | None, + names: list[HashableT] | None, + axis: AxisInt, + levels, + verify_integrity: bool, + ignore_index: bool, +) -> list[Index]: + """Return the new [index, column] result for concat.""" + return [ + _get_concat_axis_dataframe( + objs, + axis, + ignore_index, + keys, + names, + levels, + verify_integrity, + ) + if i == bm_axis + else get_objs_combined_axis( + objs, + axis=objs[0]._get_block_manager_axis(i), + intersect=intersect, + sort=sort, + ) + for i in range(2) + ] + + +def _get_concat_axis_series( + objs: list[Series | DataFrame], + ignore_index: bool, + bm_axis: AxisInt, + keys: Iterable[Hashable] | None, + levels, + verify_integrity: bool, + names: list[HashableT] | None, +) -> Index: + """Return result concat axis when concatenating Series objects.""" + if ignore_index: + return default_index(len(objs)) + elif bm_axis == 0: + indexes = [x.index for x in objs] + if keys is None: + if levels is not None: raise ValueError("levels supported only when keys is not None") concat_axis = _concat_indexes(indexes) else: - concat_axis = _make_concat_multiindex( - indexes, self.keys, self.levels, self.names - ) + concat_axis = _make_concat_multiindex(indexes, keys, levels, names) + if verify_integrity and not concat_axis.is_unique: + overlap = concat_axis[concat_axis.duplicated()].unique() + raise ValueError(f"Indexes have overlapping values: {overlap}") + return 
concat_axis + elif keys is None: + result_names: list[Hashable] = [None] * len(objs) + num = 0 + has_names = False + for i, x in enumerate(objs): + if x.ndim != 1: + raise TypeError( + f"Cannot concatenate type 'Series' with " + f"object of type '{type(x).__name__}'" + ) + if x.name is not None: + result_names[i] = x.name + has_names = True + else: + result_names[i] = num + num += 1 + if has_names: + return Index(result_names) + else: + return default_index(len(objs)) + else: + return ensure_index(keys).set_names(names) # type: ignore[arg-type] - if self.verify_integrity: - if not concat_axis.is_unique: - overlap = concat_axis[concat_axis.duplicated()].unique() - raise ValueError(f"Indexes have overlapping values: {overlap}") - return concat_axis +def _get_concat_axis_dataframe( + objs: list[Series | DataFrame], + axis: AxisInt, + ignore_index: bool, + keys: Iterable[Hashable] | None, + names: list[HashableT] | None, + levels, + verify_integrity: bool, +) -> Index: + """Return result concat axis when concatenating DataFrame objects.""" + indexes_gen = (x.axes[axis] for x in objs) + + if ignore_index: + return default_index(sum(len(i) for i in indexes_gen)) + else: + indexes = list(indexes_gen) + + if keys is None: + if levels is not None: + raise ValueError("levels supported only when keys is not None") + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex(indexes, keys, levels, names) + + if verify_integrity and not concat_axis.is_unique: + overlap = concat_axis[concat_axis.duplicated()].unique() + raise ValueError(f"Indexes have overlapping values: {overlap}") + + return concat_axis def _clean_keys_and_objs( @@ -680,7 +719,7 @@ def _clean_keys_and_objs( Returns ------- clean_objs : list[Series | DataFrame] - LIst of DataFrame and Series with Nones removed. + List of DataFrame and Series with Nones removed. keys : Index | None None if keys was None Index if objs was a Mapping or keys was not None. Filtered where objs was None. 
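
As context for the rewritten mapping/keys handling in the next hunk, the behaviour `_clean_keys_and_objs` preserves is visible from the public `concat` API. A minimal sketch, assuming only a stock pandas install (the variable names are local to this example):

    import pandas as pd

    s1 = pd.Series([1, 2])
    s2 = pd.Series([3, 4])

    # When objs is a Mapping, keys default to the mapping's keys.
    pd.concat({"a": s1, "b": s2})

    # None entries are dropped before concatenation.
    pd.concat([s1, None, s2])

    # GH#43485: len(keys) must match the number of objects.
    # pd.concat([s1, s2], keys=["only_one"])  # raises ValueError
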
@@ -690,28 +729,33 @@ def _clean_keys_and_objs( if isinstance(objs, abc.Mapping): if keys is None: keys = objs.keys() - objs_list = [objs[k] for k in keys] - else: - objs_list = list(objs) + objs = [objs[k] for k in keys] + elif isinstance(objs, (ABCSeries, ABCDataFrame)) or is_scalar(objs): + raise TypeError( + "first argument must be an iterable of pandas " + f'objects, you passed an object of type "{type(objs).__name__}"' + ) + elif not isinstance(objs, abc.Sized): + objs = list(objs) - if len(objs_list) == 0: + if len(objs) == 0: raise ValueError("No objects to concatenate") if keys is not None: if not isinstance(keys, Index): keys = Index(keys) - if len(keys) != len(objs_list): + if len(keys) != len(objs): # GH#43485 raise ValueError( f"The length of the keys ({len(keys)}) must match " - f"the length of the objects to concatenate ({len(objs_list)})" + f"the length of the objects to concatenate ({len(objs)})" ) # GH#1649 key_indices = [] clean_objs = [] ndims = set() - for i, obj in enumerate(objs_list): + for i, obj in enumerate(objs): if obj is None: continue elif isinstance(obj, (ABCSeries, ABCDataFrame)): diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 1d0f491529b56..d06bfad930d7c 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -84,8 +84,9 @@ def finalize(self, other, method=None, **kwargs): value = getattr(left, name, "") + "|" + getattr(right, name, "") object.__setattr__(self, name, value) elif method == "concat": + objs = kwargs["objs"] value = "+".join( - [getattr(o, name) for o in other.objs if getattr(o, name, None)] + [getattr(o, name) for o in objs if getattr(o, name, None)] ) object.__setattr__(self, name, value) else: diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 7dcdcd96cce51..9e2dae8d132eb 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -94,12 +94,9 @@ def test_metadata_propagation_indiv(self, monkeypatch): def finalize(self, other, method=None, **kwargs): for name in self._metadata: if method == "concat" and name == "filename": + objs = kwargs["objs"] value = "+".join( - [ - getattr(obj, name) - for obj in other.objs - if getattr(obj, name, None) - ] + [getattr(obj, name) for obj in objs if getattr(obj, name, None)] ) object.__setattr__(self, name, value) else: From f6d06b8e160232c80a835297f2343a619ba991ad Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 6 Jul 2024 06:01:24 -1000 Subject: [PATCH 153/272] REF: Minimize state in read_csv (#59194) * Move some set parse_date_cols to pythong parser * Clean up do_date_conversions * Move can cast to python parser * Move can cast to python parser * Revert "Move can cast to python parser" This reverts commit 99ca747bbc4c82934245384049a209459432221b. 
* Typing issues --- pandas/io/parsers/base_parser.py | 497 ++++++++----------------- pandas/io/parsers/c_parser_wrapper.py | 9 +- pandas/io/parsers/python_parser.py | 183 ++++++++- pandas/tests/io/parser/test_network.py | 1 + 4 files changed, 336 insertions(+), 354 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e7473aabdff87..e8faea76897c6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -28,27 +28,19 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, - is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, - pandas_dtype, -) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, ) from pandas.core.dtypes.missing import isna from pandas import ( - ArrowDtype, DataFrame, DatetimeIndex, StringDtype, @@ -58,12 +50,9 @@ ArrowExtensionArray, BaseMaskedArray, BooleanArray, - Categorical, - ExtensionArray, FloatingArray, IntegerArray, ) -from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -86,7 +75,6 @@ from pandas._typing import ( ArrayLike, DtypeArg, - DtypeObj, Hashable, HashableT, Scalar, @@ -127,7 +115,6 @@ def __init__(self, kwds) -> None: "for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates - self._parse_date_cols: set = set() self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) @@ -145,12 +132,6 @@ def __init__(self, kwds) -> None: self.false_values = kwds.get("false_values") self.cache_dates = kwds.pop("cache_dates", True) - self._date_conv = _make_date_converter( - date_format=self.date_format, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - ) - # validate header options for mi self.header = kwds.get("header") if is_list_like(self.header, allow_sets=False): @@ -181,58 +162,12 @@ def __init__(self, kwds) -> None: self._first_chunk = True - self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: - """ - Check if parse_dates are in columns. - - If user has provided names for parse_dates, check if those columns - are available. - - Parameters - ---------- - columns : list - List of names of the dataframe. - - Returns - ------- - The names of the columns which will get parsed later if a list - is given as specification. - - Raises - ------ - ValueError - If column to parse_date is not in dataframe. 
- - """ - if not isinstance(self.parse_dates, list): - return set() - - # get only columns that are references using names (str), not by index - missing_cols = ", ".join( - sorted( - { - col - for col in self.parse_dates - if isinstance(col, str) and col not in columns - } - ) - ) - if missing_cols: - raise ValueError( - f"Missing column provided to 'parse_dates': '{missing_cols}'" - ) - # Convert positions to actual column names - return { - col if (isinstance(col, str) or col in columns) else columns[col] - for col in self.parse_dates - } - def close(self) -> None: pass @@ -404,9 +339,12 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: for i, arr in enumerate(index): if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv( + arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) if self.na_filter: @@ -420,7 +358,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: assert self.index_names is not None col_name = self.index_names[i] if col_name is not None: - col_na_values, col_na_fvalues = _get_na_values( + col_na_values, col_na_fvalues = get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na ) else: @@ -451,90 +389,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: return index - @final - def _convert_to_ndarrays( - self, - dct: Mapping, - na_values, - na_fvalues, - converters=None, - dtypes=None, - ) -> dict[Any, np.ndarray]: - result = {} - for c, values in dct.items(): - conv_f = None if converters is None else converters.get(c, None) - if isinstance(dtypes, dict): - cast_type = dtypes.get(c, None) - else: - # single dtype or None - cast_type = dtypes - - if self.na_filter: - col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na - ) - else: - col_na_values, col_na_fvalues = set(), set() - - if c in self._parse_date_cols: - # GH#26203 Do not convert columns which get converted to dates - # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) - np.putmask(values, mask, np.nan) - result[c] = values - continue - - if conv_f is not None: - # conv_f applied to data before inference - if cast_type is not None: - warnings.warn( - ( - "Both a converter and dtype were specified " - f"for column {c} - only the converter will be used." 
- ), - ParserWarning, - stacklevel=find_stack_level(), - ) - - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = algorithms.isin(values, list(na_values)).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool=False, - ) - else: - is_ea = is_extension_array_dtype(cast_type) - is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) - - # general type inference and conversion - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool, - ) - - # type specified in dtype param or cast_type is an EA - if cast_type is not None: - cast_type = pandas_dtype(cast_type) - if cast_type and (cvals.dtype != cast_type or is_ea): - if not is_ea and na_count > 0: - if is_bool_dtype(cast_type): - raise ValueError(f"Bool column has NA values in column {c}") - cvals = self._cast_types(cvals, cast_type, c) - - result[c] = cvals - return result - @final def _set_noconvert_dtype_columns( self, col_indices: list[int], names: Sequence[Hashable] @@ -580,6 +434,7 @@ def _set(x) -> int: return x if isinstance(self.parse_dates, list): + validate_parse_dates_presence(self.parse_dates, names) for val in self.parse_dates: noconvert_columns.add(_set(val)) @@ -705,80 +560,6 @@ def _infer_types( return result, na_count - @final - def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: - """ - Cast values to specified type - - Parameters - ---------- - values : ndarray or ExtensionArray - cast_type : np.dtype or ExtensionDtype - dtype to cast values to - column : string - column name - used only for error reporting - - Returns - ------- - converted : ndarray or ExtensionArray - """ - if isinstance(cast_type, CategoricalDtype): - known_cats = cast_type.categories is not None - - if not is_object_dtype(values.dtype) and not known_cats: - # TODO: this is for consistency with - # c-parser which parses all categories - # as strings - values = lib.ensure_string_array( - values, skipna=False, convert_na_value=False - ) - - cats = Index(values).unique().dropna() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, true_values=self.true_values - ) - - # use the EA's implementation of casting - elif isinstance(cast_type, ExtensionDtype): - array_type = cast_type.construct_array_type() - try: - if isinstance(cast_type, BooleanDtype): - # error: Unexpected keyword argument "true_values" for - # "_from_sequence_of_strings" of "ExtensionArray" - values_str = [str(val) for val in values] - return array_type._from_sequence_of_strings( # type: ignore[call-arg] - values_str, - dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - none_values=self.na_values, - ) - else: - return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError as err: - raise NotImplementedError( - f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" - ) from err - - elif isinstance(values, ExtensionArray): - values = values.astype(cast_type, copy=False) - elif issubclass(cast_type.type, str): - # TODO: why skipna=True here and False above? 
some tests depend - # on it here, but nothing fails if we change it above - # (as no tests get there as of 2022-12-06) - values = lib.ensure_string_array( - values, skipna=True, convert_na_value=False - ) - else: - try: - values = astype_array(values, cast_type, copy=True) - except ValueError as err: - raise ValueError( - f"Unable to convert column {column} to type {cast_type}" - ) from err - return values - @overload def _do_date_conversions( self, @@ -799,16 +580,25 @@ def _do_date_conversions( names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, ) -> Mapping[Hashable, ArrayLike] | DataFrame: - if isinstance(self.parse_dates, list): - return _process_date_conversion( - data, - self._date_conv, - self.parse_dates, - self.index_col, - self.index_names, - names, - dtype_backend=self.dtype_backend, + if not isinstance(self.parse_dates, list): + return data + for colspec in self.parse_dates: + if isinstance(colspec, int) and colspec not in data: + colspec = names[colspec] + if (isinstance(self.index_col, list) and colspec in self.index_col) or ( + isinstance(self.index_names, list) and colspec in self.index_names + ): + continue + result = date_converter( + data[colspec], + col=colspec, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data[colspec] = result # type: ignore[index] return data @@ -903,56 +693,6 @@ def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> Sequen return usecols - @final - def _validate_usecols_arg(self, usecols): - """ - Validate the 'usecols' parameter. - - Checks whether or not the 'usecols' parameter contains all integers - (column selection by index), strings (column by name) or is a callable. - Raises a ValueError if that is not the case. - - Parameters - ---------- - usecols : list-like, callable, or None - List of columns to use when parsing or a callable that can be used - to filter a list of table columns. - - Returns - ------- - usecols_tuple : tuple - A tuple of (verified_usecols, usecols_dtype). - - 'verified_usecols' is either a set if an array-like is passed in or - 'usecols' if a callable or None is passed in. - - 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like - is passed in or None if a callable or None is passed in. - """ - msg = ( - "'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable." - ) - if usecols is not None: - if callable(usecols): - return usecols, None - - if not is_list_like(usecols): - # see gh-20529 - # - # Ensure it is iterable container but not string. 
- raise ValueError(msg) - - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - - if usecols_dtype not in ("empty", "integer", "string"): - raise ValueError(msg) - - usecols = set(usecols) - - return usecols, usecols_dtype - return usecols, None - @final def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: if not is_index_col(index_col): @@ -1042,40 +782,37 @@ def _get_empty_meta( return index, columns, col_dict -def _make_date_converter( +def date_converter( + date_col, + col: Hashable, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - def converter(date_col, col: Hashable): - if date_col.dtype.kind in "Mm": - return date_col - - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format + if date_col.dtype.kind in "Mm": + return date_col + + date_fmt = date_format.get(col) if isinstance(date_format, dict) else date_format + + str_objs = lib.ensure_string_array(np.asarray(date_col)) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs - str_objs = lib.ensure_string_array(date_col) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - # test_multi_index_parse_dates - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - - return converter + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values parser_defaults = { @@ -1118,43 +855,7 @@ def converter(date_col, col: Hashable): } -def _process_date_conversion( - data_dict: Mapping[Hashable, ArrayLike] | DataFrame, - converter: Callable, - parse_spec: list, - index_col, - index_names, - columns: Sequence[Hashable] | Index, - dtype_backend=lib.no_default, -) -> Mapping[Hashable, ArrayLike] | DataFrame: - for colspec in parse_spec: - if isinstance(colspec, int) and colspec not in data_dict: - colspec = columns[colspec] - if (isinstance(index_col, list) and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - result = converter(np.asarray(data_dict[colspec]), col=colspec) - # error: Unsupported target for indexed assignment - # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") - data_dict[colspec] = result # type: ignore[index] - - return data_dict - - -def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): +def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. 
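
The renamed module-level `get_na_values` resolves the NA sentinels applied to a single column. A hedged sketch of the user-facing behaviour it backs; the CSV literal below is invented for illustration:

    import io

    import pandas as pd

    data = "a,b\n-1,x\n2,-1\n"
    # A dict na_values is resolved per column: "-1" is treated as a
    # sentinel only in column "a"; column "b" keeps the literal string.
    df = pd.read_csv(io.StringIO(data), na_values={"a": ["-1"]})
    # df["a"] -> [NaN, 2.0] (float64); df["b"] -> ["x", "-1"] (object)
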
@@ -1191,3 +892,99 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): def is_index_col(col) -> bool: return col is not None and col is not False + + +def validate_parse_dates_presence( + parse_dates: bool | list, columns: Sequence[Hashable] +) -> set: + """ + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Returns + ------- + The names of the columns which will get parsed later if a list + is given as specification. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. + + """ + if not isinstance(parse_dates, list): + return set() + + missing = set() + unique_cols = set() + for col in parse_dates: + if isinstance(col, str): + if col not in columns: + missing.add(col) + else: + unique_cols.add(col) + elif col in columns: + unique_cols.add(col) + else: + unique_cols.add(columns[col]) + if missing: + missing_cols = ", ".join(sorted(missing)) + raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") + return unique_cols + + +def _validate_usecols_arg(usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. 
+ raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4de626288aa41..b59a778624c49 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -30,7 +30,9 @@ from pandas.io.parsers.base_parser import ( ParserBase, ParserError, + date_converter, is_index_col, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -160,7 +162,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: ) # error: Cannot determine type of 'names' - self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + validate_parse_dates_presence(self.parse_dates, self.names) # type: ignore[has-type] self._set_noconvert_columns() # error: Cannot determine type of 'names' @@ -344,9 +346,12 @@ def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv( + values = date_converter( values, col=self.index_names[index] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) return values diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f7d2aa2419429..05fe963e9b2b7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -10,9 +10,11 @@ from typing import ( IO, TYPE_CHECKING, + Any, DefaultDict, Literal, cast, + final, ) import warnings @@ -27,20 +29,39 @@ from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_integer, is_numeric_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.indexes.api import Index + from pandas.io.common import ( dedup_names, is_potential_multi_index, ) from pandas.io.parsers.base_parser import ( ParserBase, + get_na_values, parser_defaults, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -53,13 +74,13 @@ from pandas._typing import ( ArrayLike, + DtypeObj, ReadCsvBuffer, Scalar, T, ) from pandas import ( - Index, MultiIndex, Series, ) @@ -157,7 +178,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: if self._col_indices is None: self._col_indices = list(range(len(self.columns))) - self._parse_date_cols = self._validate_parse_dates_presence(self.columns) self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: @@ -370,6 +390,165 @@ def _convert_data( clean_dtypes, ) + @final + def _convert_to_ndarrays( + self, + dct: Mapping, + na_values, + na_fvalues, + converters=None, + dtypes=None, + ) -> dict[Any, np.ndarray]: + result = {} + parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns) + for c, values in dct.items(): + conv_f = None if converters 
is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if c in parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) # pyright: ignore[reportArgumentType] + np.putmask(values, mask, np.nan) + result[c] = values + continue + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will be used." + ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool=False, + ) + else: + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, + ) + + # type specified in dtype param or cast_type is an EA + if cast_type is not None: + cast_type = pandas_dtype(cast_type) + if cast_type and (cvals.dtype != cast_type or is_ea): + if not is_ea and na_count > 0: + if is_bool_dtype(cast_type): + raise ValueError(f"Bool column has NA values in column {c}") + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + return result + + @final + def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray or ExtensionArray + cast_type : np.dtype or ExtensionDtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray or ExtensionArray + """ + if isinstance(cast_type, CategoricalDtype): + known_cats = cast_type.categories is not None + + if not is_object_dtype(values.dtype) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = lib.ensure_string_array( + values, skipna=False, convert_na_value=False + ) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif isinstance(cast_type, ExtensionDtype): + array_type = cast_type.construct_array_type() + try: + if isinstance(cast_type, BooleanDtype): + # error: Unexpected keyword argument "true_values" for + # "_from_sequence_of_strings" of "ExtensionArray" + values_str = [str(val) for val in values] + return array_type._from_sequence_of_strings( # type: ignore[call-arg] + values_str, + dtype=cast_type, + true_values=self.true_values, # pyright: ignore[reportCallIssue] + false_values=self.false_values, # pyright: 
ignore[reportCallIssue] + none_values=self.na_values, # pyright: ignore[reportCallIssue] + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + elif isinstance(values, ExtensionArray): + values = values.astype(cast_type, copy=False) + elif issubclass(cast_type.type, str): + # TODO: why skipna=True here and False above? some tests depend + # on it here, but nothing fails if we change it above + # (as no tests get there as of 2022-12-06) + values = lib.ensure_string_array( + values, skipna=True, convert_na_value=False + ) + else: + try: + values = astype_array(values, cast_type, copy=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index f63cc3d56bf89..4ccfa8e81e883 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -75,6 +75,7 @@ def tips_df(datapath): @pytest.mark.single_cpu +@pytest.mark.network @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: From 236d89b8565df844da0badfbc9f2db7084883933 Mon Sep 17 00:00:00 2001 From: mutricyl <118692416+mutricyl@users.noreply.github.com> Date: Sat, 6 Jul 2024 18:14:37 +0200 Subject: [PATCH 154/272] update algo.take to solve #59177 (#59181) * update algo.take to solve #59177 * forgot to update TestExtensionTake::test_take_coerces_list * fixing pandas/tests/dtypes/test_generic.py::TestABCClasses::test_abc_hierarchy * ABCExtensionArray set formatting --------- Co-authored-by: Laurent Mutricy --- pandas/core/algorithms.py | 10 +++++++--- pandas/tests/test_take.py | 10 +++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0d97f8a298fdb..92bd55cac9c5e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -68,6 +68,7 @@ ABCExtensionArray, ABCIndex, ABCMultiIndex, + ABCNumpyExtensionArray, ABCSeries, ABCTimedeltaArray, ) @@ -1161,11 +1162,14 @@ def take( ... ) array([ 10, 10, -10]) """ - if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)): + if not isinstance( + arr, + (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, ABCNumpyExtensionArray), + ): # GH#52981 raise TypeError( - "pd.api.extensions.take requires a numpy.ndarray, " - f"ExtensionArray, Index, or Series, got {type(arr).__name__}." + "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, " + f"Index, Series, or NumpyExtensionArray got {type(arr).__name__}." 
) indices = ensure_platform_int(indices) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index ce2e4e0f6cec5..451ef42fff3d1 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -5,6 +5,7 @@ from pandas._libs import iNaT +from pandas import array import pandas._testing as tm import pandas.core.algorithms as algos @@ -303,7 +304,14 @@ def test_take_coerces_list(self): arr = [1, 2, 3] msg = ( "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, " - "Index, or Series, got list" + "Index, Series, or NumpyExtensionArray got list" ) with pytest.raises(TypeError, match=msg): algos.take(arr, [0, 0]) + + def test_take_NumpyExtensionArray(self): + # GH#59177 + arr = array([1 + 1j, 2, 3]) # NumpyEADtype('complex128') (NumpyExtensionArray) + assert algos.take(arr, [2]) == 2 + arr = array([1, 2, 3]) # Int64Dtype() (ExtensionArray) + assert algos.take(arr, [2]) == 2 From 262fcfbffcee5c3116e86a951d8b693f90411e68 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Mon, 8 Jul 2024 18:21:20 +0300 Subject: [PATCH 155/272] BUG: Fix .dt.microsecond accessor for pyarrow-backed Series (#59183) * BUG: Fix .dt.microsecond accessor for pyarrow-backed Series * Add whatsnew entry * Write test --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 5 ++++- pandas/tests/extension/test_arrow.py | 28 +++++++++++++++++++++++++--- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 41d18feaa532c..d24c39d83bad5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -502,6 +502,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) - Bug in :meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`) +- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. 
(:issue:`59154`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4ff7553af2b69..943656ba48432 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2794,7 +2794,10 @@ def _dt_days_in_month(self) -> Self: @property def _dt_microsecond(self) -> Self: - return type(self)(pc.microsecond(self._pa_array)) + # GH 59154 + us = pc.microsecond(self._pa_array) + ms_to_us = pc.multiply(pc.millisecond(self._pa_array), 1000) + return type(self)(pc.add(us, ms_to_us)) @property def _dt_minute(self) -> Self: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f2e9d2321f33e..4fad5e45409b9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2437,13 +2437,13 @@ def test_unsupported_dt(data): ["hour", 3], ["minute", 4], ["is_leap_year", False], - ["microsecond", 5], + ["microsecond", 2000], ["month", 1], ["nanosecond", 6], ["quarter", 1], ["second", 7], ["date", date(2023, 1, 2)], - ["time", time(3, 4, 7, 5)], + ["time", time(3, 4, 7, 2000)], ], ) def test_dt_properties(prop, expected): @@ -2456,7 +2456,7 @@ def test_dt_properties(prop, expected): hour=3, minute=4, second=7, - microsecond=5, + microsecond=2000, nanosecond=6, ), None, @@ -2473,6 +2473,28 @@ def test_dt_properties(prop, expected): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("microsecond", [2000, 5, 0]) +def test_dt_microsecond(microsecond): + # GH 59183 + ser = pd.Series( + [ + pd.Timestamp( + year=2024, + month=7, + day=7, + second=5, + microsecond=microsecond, + nanosecond=6, + ), + None, + ], + dtype=ArrowDtype(pa.timestamp("ns")), + ) + result = ser.dt.microsecond + expected = pd.Series([microsecond, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + def test_dt_is_month_start_end(): ser = pd.Series( [ From aa6d611de0f641cc93a27bfb281566fb545fbce9 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 8 Jul 2024 21:03:20 +0530 Subject: [PATCH 156/272] DOC: add SA01 for pandas.Period.now (#59202) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 13 +++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d0123e64eb542..b01866a6d6c82 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -95,7 +95,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ -i "pandas.Period.month SA01" \ - -i "pandas.Period.now SA01" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.Period.strftime PR01,SA01" \ -i "pandas.Period.to_timestamp SA01" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 023a0f52e320f..c6ba97fe9f1a2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2472,11 +2472,24 @@ cdef class _Period(PeriodMixin): """ Return the period of now's date. + The `now` method provides a convenient way to generate a period + object for the current date and time. This can be particularly + useful in financial and economic analysis, where data is often + collected and analyzed in regular intervals (e.g., hourly, daily, + monthly). By specifying the frequency, users can create periods + that match the granularity of their data. 
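
A doctest-style illustration of the granularity point above; the printed value depends on the current date, so it is skipped in the same way as the existing example in this docstring's Examples section:

    >>> pd.Period.now('M')  # doctest: +SKIP
    Period('2024-07', 'M')
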
+ Parameters ---------- freq : str, BaseOffset Frequency to use for the returned period. + See Also + -------- + to_datetime : Convert argument to datetime. + Period : Represents a period of time. + Period.to_timestamp : Return the Timestamp representation of the Period. + Examples -------- >>> pd.Period.now('h') # doctest: +SKIP From 14297271da10273ebc20ecc443e0aae77b9945e9 Mon Sep 17 00:00:00 2001 From: Anurag Varma Date: Mon, 8 Jul 2024 21:05:40 +0530 Subject: [PATCH 157/272] DOC: fix the Return type for pandas.Timestamp.asm8 (#59200) updated documentation for asm8 return type --- pandas/_libs/tslibs/nattype.pyx | 2 +- pandas/_libs/tslibs/timestamps.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index c1f2341328570..4544cf56a11ec 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1516,7 +1516,7 @@ default 'raise' See Also -------- - Timestamp.asm8 : Return numpy datetime64 format in nanoseconds. + Timestamp.asm8 : Return numpy datetime64 format with same precision. Timestamp.to_pydatetime : Convert Timestamp object to a native Python datetime object. to_timedelta : Convert argument into timedelta object, diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index c4bd9e1b47bbe..cd749effd1a5f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1139,7 +1139,7 @@ cdef class _Timestamp(ABCTimestamp): See Also -------- - Timestamp.asm8 : Return numpy datetime64 format in nanoseconds. + Timestamp.asm8 : Return numpy datetime64 format with same precision. Timestamp.to_pydatetime : Convert Timestamp object to a native Python datetime object. to_timedelta : Convert argument into timedelta object, @@ -1170,7 +1170,7 @@ cdef class _Timestamp(ABCTimestamp): @property def asm8(self) -> np.datetime64: """ - Return numpy datetime64 format in nanoseconds. + Return numpy datetime64 format with same precision. See Also -------- From 61c5fbffde5f12aac1b3ae9cd131ebbe9b8ab644 Mon Sep 17 00:00:00 2001 From: wooseogchoi Date: Mon, 8 Jul 2024 11:36:57 -0400 Subject: [PATCH 158/272] DOC: added adbc connection in con parameter of to_sql and example of its usage (#59198) * doc: to_sql docs should mention ADBC #59095 * remove two blank lines at the end of docstring. * added one more space in in-line comments of example. --- pandas/core/generic.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 312f5d20d794f..43003553d7ad6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2779,7 +2779,8 @@ def to_sql( ---------- name : str Name of SQL table. - con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + con : ADBC connection, sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. Legacy support is provided for sqlite3.Connection objects. The user is responsible for engine disposal and connection closure for the SQLAlchemy @@ -2966,6 +2967,22 @@ def to_sql( >>> with engine.connect() as conn: ... conn.execute(text("SELECT * FROM integers")).fetchall() [(1,), (None,), (2,)] + + .. 
versionadded:: 2.2.0 + + pandas now supports writing via ADBC drivers + + >>> df = pd.DataFrame({'name' : ['User 10', 'User 11', 'User 12']}) + >>> df + name + 0 User 10 + 1 User 11 + 2 User 12 + + >>> from adbc_driver_sqlite import dbapi # doctest:+SKIP + >>> with dbapi.connect("sqlite://") as conn: # doctest:+SKIP + ... df.to_sql(name="users", con=conn) + 3 """ # noqa: E501 from pandas.io import sql From a93e2e22c54bcd47b2a6189c7f8e2449fd8d3269 Mon Sep 17 00:00:00 2001 From: Ilya <34696956+LuckIlNe@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:38:58 +0400 Subject: [PATCH 159/272] =?UTF-8?q?51500:=20Add=20test=20for=20unexpected?= =?UTF-8?q?=20behavior=20math=20operations=20using=20multiin=E2=80=A6=20(#?= =?UTF-8?q?59191)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 51500: Add test for unexpected behavior math operations using multiindexes --- .../indexing/multiindex/test_multiindex.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 481a77fd03b05..7140ad7d1e9f5 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -232,3 +232,20 @@ def test_multiindex_from_tuples_with_nan(self): [("a", "b", "c"), (np.nan, np.nan, np.nan), ("d", "", "")] ) tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("operation", ["div", "mul", "add", "sub"]) + def test_groupyby_rename_categories_operation_with_multiindex(self, operation): + # GH#51500 + data = DataFrame( + [["C", "B", "B"], ["B", "A", "A"], ["B", "A", "B"]], columns=["0", "1", "2"] + ) + data["0"] = data["0"].astype("category") + data["0"] = data["0"].cat.rename_categories({"C": "B", "B": "C"}) + + a = data.groupby(by=["0", "1"])["2"].value_counts() + b = data.groupby(by=["0", "1"]).size() + + result = getattr(a, operation)(b) + expected = getattr(a, operation)(b.sort_index(ascending=False)) + + tm.assert_series_equal(result, expected) From 60900429bc7ac8b0981977740a86f807d19e069e Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 8 Jul 2024 16:42:07 +0100 Subject: [PATCH 160/272] CLEAN: Enforce pdep6 (#59007) * enforce pdep6 * fixup Block.time_test benchmark * update comment * update warn to raise * add missing assertion * simplify * remove default value for `raise_on_upcast` * add whatsnew --- asv_bench/benchmarks/indexing.py | 13 +- doc/source/user_guide/categorical.rst | 2 +- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexing.py | 16 +- pandas/core/internals/blocks.py | 41 +- pandas/tests/copy_view/test_indexing.py | 26 +- pandas/tests/copy_view/test_methods.py | 26 +- pandas/tests/frame/indexing/test_coercion.py | 36 +- pandas/tests/frame/indexing/test_indexing.py | 87 ++-- pandas/tests/frame/indexing/test_set_value.py | 7 +- pandas/tests/frame/indexing/test_setitem.py | 10 +- pandas/tests/frame/indexing/test_where.py | 50 +-- pandas/tests/frame/methods/test_update.py | 11 +- pandas/tests/frame/test_constructors.py | 10 +- .../tests/indexing/multiindex/test_setitem.py | 10 +- pandas/tests/indexing/test_at.py | 6 +- .../indexing/test_chaining_and_caching.py | 8 +- pandas/tests/indexing/test_iloc.py | 10 +- pandas/tests/indexing/test_indexing.py | 37 +- pandas/tests/indexing/test_loc.py | 86 +--- pandas/tests/internals/test_internals.py | 12 +- pandas/tests/series/indexing/test_indexing.py | 37 +- pandas/tests/series/indexing/test_setitem.py | 418 
++++++++---------- pandas/tests/series/indexing/test_where.py | 19 +- .../series/methods/test_convert_dtypes.py | 2 +- pandas/tests/series/methods/test_fillna.py | 3 +- pandas/tests/series/methods/test_update.py | 44 +- pandas/tests/series/test_missing.py | 7 +- web/pandas/pdeps/0006-ban-upcasting.md | 3 +- 29 files changed, 383 insertions(+), 655 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 15e691d46f693..b2495356f134c 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -546,24 +546,17 @@ def time_chained_indexing(self, mode): class Block: - params = [ - (True, "True"), - (np.array(True), "np.array(True)"), - ] - - def setup(self, true_value, mode): + def setup(self): self.df = DataFrame( False, columns=np.arange(500).astype(str), index=date_range("2010-01-01", "2011-01-01"), ) - self.true_value = true_value - - def time_test(self, true_value, mode): + def time_test(self): start = datetime(2010, 5, 1) end = datetime(2010, 9, 1) - self.df.loc[start:end, :] = true_value + self.df.loc[start:end, :] = True from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 7b2fd32303845..1e7d66dfeb142 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -793,7 +793,7 @@ Assigning a ``Categorical`` to parts of a column of other types will use the val :okwarning: df = pd.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) - df.loc[1:2, "a"] = pd.Categorical(["b", "b"], categories=["a", "b"]) + df.loc[1:2, "a"] = pd.Categorical([2, 2], categories=[2, 3]) df.loc[2:3, "b"] = pd.Categorical(["b", "b"], categories=["a", "b"]) df df.dtypes diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d24c39d83bad5..711bd417d979c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -360,6 +360,7 @@ Other Removals - Changed the default value of ``na_action`` in :meth:`Categorical.map` to ``None`` (:issue:`51645`) - Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`) - Enforce deprecation in :func:`testing.assert_series_equal` and :func:`testing.assert_frame_equal` with object dtype and mismatched null-like values, which are now considered not-equal (:issue:`18463`) +- Enforce banning of upcasting in in-place setitem-like operations (:issue:`59007`) (see `PDEP6 `_) - Enforced deprecation ``all`` and ``any`` reductions with ``datetime64``, :class:`DatetimeTZDtype`, and :class:`PeriodDtype` dtypes (:issue:`58029`) - Enforced deprecation disallowing ``float`` "periods" in :func:`date_range`, :func:`period_range`, :func:`timedelta_range`, :func:`interval_range`, (:issue:`56036`) - Enforced deprecation disallowing parsing datetimes with mixed time zones unless user passes ``utc=True`` to :func:`to_datetime` (:issue:`57275`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8d1239ff71174..455e61b8bc254 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -25,7 +25,6 @@ ) from pandas.errors.cow import _chained_assignment_msg from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( can_hold_element, @@ -2124,14 +2123,14 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: self.obj._mgr.column_setitem( loc, 
plane_indexer, value, inplace_only=True ) - except (ValueError, TypeError, LossySetitemError): + except (ValueError, TypeError, LossySetitemError) as exc: # If we're setting an entire column and we can't do it inplace, # then we can use value's dtype (or inferred dtype) # instead of object dtype = self.obj.dtypes.iloc[loc] if dtype not in (np.void, object) and not self.obj.empty: # - Exclude np.void, as that is a special case for expansion. - # We want to warn for + # We want to raise for # df = pd.DataFrame({'a': [1, 2]}) # df.loc[:, 'a'] = .3 # but not for @@ -2140,14 +2139,9 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: # - Exclude `object`, as then no upcasting happens. # - Exclude empty initial object with enlargement, # as then there's nothing to be inconsistent with. - warnings.warn( - f"Setting an item of incompatible dtype is deprecated " - "and will raise in a future error of pandas. " - f"Value '{value}' has dtype incompatible with {dtype}, " - "please explicitly cast to a compatible dtype first.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise TypeError( + f"Invalid value '{value}' for dtype '{dtype}'" + ) from exc self.obj.isetitem(loc, value) else: # set value into the column (first attempting to operate inplace, then diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6bb335bca12b3..149bef6258bfa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -428,7 +428,7 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: # Up/Down-casting @final - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + def coerce_to_target_dtype(self, other, raise_on_upcast: bool) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -455,7 +455,7 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) ) ): - warn_on_upcast = False + raise_on_upcast = False elif ( isinstance(other, np.ndarray) and other.ndim == 1 @@ -463,17 +463,10 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and is_float_dtype(other.dtype) and lib.has_only_ints_or_nan(other) ): - warn_on_upcast = False - - if warn_on_upcast: - warnings.warn( - f"Setting an item of incompatible dtype is deprecated " - "and will raise an error in a future version of pandas. 
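The `raise_on_upcast = False` carve-outs above keep the long-standing NA semantics: an NA that is valid for the column's dtype still upcasts silently, while a datetime-like NA aimed at an integer column does not qualify and now raises. Sketched, assuming this patch:

>>> import numpy as np
>>> import pandas as pd
>>> ser = pd.Series([1, 2, 3])
>>> ser[0] = np.nan      # valid NA for a numeric column: silently upcasts
>>> ser.dtype
dtype('float64')
>>> ser = pd.Series([1, 2, 3])
>>> ser[0] = pd.NaT      # datetime-like NA is not a valid int64 NA
Traceback (most recent call last):
  ...
TypeError: Invalid value 'NaT' for dtype 'int64'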
" - f"Value '{other}' has dtype incompatible with {self.values.dtype}, " - "please explicitly cast to a compatible dtype first.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise_on_upcast = False + + if raise_on_upcast: + raise TypeError(f"Invalid value '{other}' for dtype '{self.values.dtype}'") if self.values.dtype == new_dtype: raise AssertionError( f"Did not expect new dtype {new_dtype} to equal self.dtype " @@ -720,7 +713,7 @@ def replace( if value is None or value is NA: blk = self.astype(np.dtype(object)) else: - blk = self.coerce_to_target_dtype(value) + blk = self.coerce_to_target_dtype(value, raise_on_upcast=False) return blk.replace( to_replace=to_replace, value=value, @@ -1105,7 +1098,7 @@ def setitem(self, indexer, value) -> Block: casted = np_can_hold_element(values.dtype, value) except LossySetitemError: # current dtype cannot store value, coerce to common dtype - nb = self.coerce_to_target_dtype(value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(value, raise_on_upcast=True) return nb.setitem(indexer, value) else: if self.dtype == _dtype_obj: @@ -1176,7 +1169,7 @@ def putmask(self, mask, new) -> list[Block]: if not is_list_like(new): # using just new[indexer] can't save us the need to cast return self.coerce_to_target_dtype( - new, warn_on_upcast=True + new, raise_on_upcast=True ).putmask(mask, new) else: indexer = mask.nonzero()[0] @@ -1244,7 +1237,7 @@ def where(self, other, cond) -> list[Block]: if self.ndim == 1 or self.shape[0] == 1: # no need to split columns - block = self.coerce_to_target_dtype(other) + block = self.coerce_to_target_dtype(other, raise_on_upcast=False) return block.where(orig_other, cond) else: @@ -1438,7 +1431,7 @@ def shift(self, periods: int, fill_value: Any = None) -> list[Block]: fill_value, ) except LossySetitemError: - nb = self.coerce_to_target_dtype(fill_value) + nb = self.coerce_to_target_dtype(fill_value, raise_on_upcast=False) return nb.shift(periods, fill_value=fill_value) else: @@ -1637,11 +1630,11 @@ def setitem(self, indexer, value): except (ValueError, TypeError): if isinstance(self.dtype, IntervalDtype): # see TestSetitemFloatIntervalWithIntIntervalValues - nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(orig_value, raise_on_upcast=True) return nb.setitem(orig_indexer, orig_value) elif isinstance(self, NDArrayBackedExtensionBlock): - nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(orig_value, raise_on_upcast=True) return nb.setitem(orig_indexer, orig_value) else: @@ -1676,13 +1669,13 @@ def where(self, other, cond) -> list[Block]: if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): # TestSetitemFloatIntervalWithIntIntervalValues - blk = self.coerce_to_target_dtype(orig_other) + blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) return blk.where(orig_other, orig_cond) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_other) + blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) return blk.where(orig_other, orig_cond) else: @@ -1737,13 +1730,13 @@ def putmask(self, mask, new) -> list[Block]: if isinstance(self.dtype, IntervalDtype): # Discussion about what we want to support in the general # case GH#39584 - blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + blk = self.coerce_to_target_dtype(orig_new, 
raise_on_upcast=True) return blk.putmask(orig_mask, orig_new) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + blk = self.coerce_to_target_dtype(orig_new, raise_on_upcast=True) return blk.putmask(orig_mask, orig_new) else: diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index b10141b0d63f4..37a21e1098e78 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -725,15 +725,13 @@ def test_column_as_series_set_with_upcast(backend): with pytest.raises(TypeError, match="Invalid value"): s[0] = "foo" expected = Series([1, 2, 3], name="a") + tm.assert_series_equal(s, expected) + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) else: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): s[0] = "foo" - expected = Series(["foo", 2, 3], dtype=object, name="a") - - tm.assert_series_equal(s, expected) - tm.assert_frame_equal(df, df_orig) - # ensure cached series on getitem is not the changed series - tm.assert_series_equal(df["a"], df_orig["a"]) @pytest.mark.parametrize( @@ -805,16 +803,14 @@ def test_set_value_copy_only_necessary_column(indexer_func, indexer, val, col): view = df[:] if val == "a": - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype is deprecated" - ): + with pytest.raises(TypeError, match="Invalid value"): indexer_func(df)[indexer] = val + else: + indexer_func(df)[indexer] = val - indexer_func(df)[indexer] = val - - assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) - tm.assert_frame_equal(view, df_orig) + assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + tm.assert_frame_equal(view, df_orig) def test_series_midx_slice(): diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 3712a74fe54ed..6f0cbe12a2ea0 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1105,26 +1105,26 @@ def test_putmask_aligns_rhs_no_reference(dtype): assert np.shares_memory(arr_a, get_array(df, "a")) -@pytest.mark.parametrize( - "val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)] -) -def test_putmask_dont_copy_some_blocks(val, exp, warn): +@pytest.mark.parametrize("val, exp, raises", [(5.5, True, True), (5, False, False)]) +def test_putmask_dont_copy_some_blocks(val, exp, raises: bool): df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5}) view = df[:] df_orig = df.copy() indexer = DataFrame( [[True, False, False], [True, False, False]], columns=list("abc") ) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + df[indexer] = val + else: df[indexer] = val - - assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) - # TODO(CoW): Could split blocks to avoid copying the whole block - assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp - assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) - assert df._mgr._has_no_reference(1) is not exp - assert not 
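Note the deliberate asymmetry in the blocks.py changes: `where` still passes `raise_on_upcast=False` because it returns a new object, while `setitem` and `putmask`, the in-place paths, pass `True`. Roughly, with this patch:

>>> import pandas as pd
>>> ser = pd.Series([1, 2, 3])
>>> ser.where([True, False, True], "x")    # new object, upcast to object is fine
0    1
1    x
2    3
dtype: object
>>> ser.mask([False, True, False], "x", inplace=True)
Traceback (most recent call last):
  ...
TypeError: Invalid value 'x' for dtype 'int64'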
df._mgr._has_no_reference(2) - tm.assert_frame_equal(view, df_orig) + assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) + # TODO(CoW): Could split blocks to avoid copying the whole block + assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp + assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) + assert df._mgr._has_no_reference(1) is not exp + assert not df._mgr._has_no_reference(2) + tm.assert_frame_equal(view, df_orig) @pytest.mark.parametrize("dtype", ["int64", "Int64"]) diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index f55605d1ffa12..472bfb7772a80 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -49,35 +49,19 @@ def test_loc_setitem_multiindex_columns(self, consolidate): def test_37477(): # fixed by GH#45121 orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]}) - expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]}) df = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.at[1, "B"] = 1.2 - tm.assert_frame_equal(df, expected) - df = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[1, "B"] = 1.2 - tm.assert_frame_equal(df, expected) - df = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.iat[1, 1] = 1.2 - tm.assert_frame_equal(df, expected) - df = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[1, 1] = 1.2 - tm.assert_frame_equal(df, expected) def test_6942(indexer_al): @@ -107,19 +91,11 @@ def test_26395(indexer_al): expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) tm.assert_frame_equal(df, expected) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): indexer_al(df)["C", "D"] = 44.5 - expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) - tm.assert_frame_equal(df, expected) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): indexer_al(df)["C", "D"] = "hello" - expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) - tm.assert_frame_equal(df, expected) @pytest.mark.xfail(reason="unwanted upcast") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 9cd2c2515f49a..693075a881833 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -25,7 +25,6 @@ Timestamp, date_range, isna, - notna, to_datetime, ) import pandas._testing as tm @@ -833,13 +832,8 @@ def test_setitem_single_column_mixed_datetime(self): tm.assert_series_equal(result, expected) # GH#16674 iNaT is treated as an integer when given by the user - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc["b", "timestamp"] = iNaT - assert not 
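As the rewritten test_37477 and test_26395 pin down, the four scalar setters (.at, .loc, .iat, .iloc) now fail identically instead of each emitting its own warning, e.g. (illustrative):

>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
>>> df.at[1, "B"] = 1.2
Traceback (most recent call last):
  ...
TypeError: Invalid value '1.2' for dtype 'int64'
>>> df.iat[1, 1] = 1.2
Traceback (most recent call last):
  ...
TypeError: Invalid value '1.2' for dtype 'int64'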
isna(df.loc["b", "timestamp"]) - assert df["timestamp"].dtype == np.object_ - assert df.loc["b", "timestamp"] == iNaT # allow this syntax (as of GH#3216) df.loc["c", "timestamp"] = np.nan @@ -851,35 +845,11 @@ def test_setitem_single_column_mixed_datetime(self): def test_setitem_mixed_datetime(self): # GH 9336 - expected = DataFrame( - { - "a": [0, 0, 0, 0, 13, 14], - "b": [ - datetime(2012, 1, 1), - 1, - "x", - "y", - datetime(2013, 1, 1), - datetime(2014, 1, 1), - ], - } - ) df = DataFrame(0, columns=list("ab"), index=range(6)) df["b"] = pd.NaT df.loc[0, "b"] = datetime(2012, 1, 1) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[1, "b"] = 1 - df.loc[[2, 3], "b"] = "x", "y" - A = np.array( - [ - [13, np.datetime64("2013-01-01T00:00:00")], - [14, np.datetime64("2014-01-01T00:00:00")], - ] - ) - df.loc[[4, 5], ["a", "b"]] = A - tm.assert_frame_equal(df, expected) def test_setitem_frame_float(self, float_frame): piece = float_frame.loc[float_frame.index[:2], ["A", "B"]] @@ -936,8 +906,12 @@ def test_setitem_frame_upcast(self): # needs upcasting df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) df2 = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + # Manually upcast so we can add .5 + df = df.astype({"A": "float64", "B": "float64"}) + df2 = df2.astype({"A": "float64", "B": "float64"}) + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 expected = df.reindex(columns=["A", "B"]) expected += 0.5 expected["C"] = df["C"] @@ -1366,12 +1340,8 @@ def test_loc_setitem_rhs_frame(self, idxr, val): # GH#47578 df = DataFrame({"a": [1, 2]}) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2]) - expected = DataFrame({"a": [np.nan, val]}) - tm.assert_frame_equal(df, expected) def test_iloc_setitem_enlarge_no_warning(self): # GH#47381 @@ -1579,18 +1549,9 @@ def test_setitem(self): # With NaN: because uint64 has no NaN element, # the column should be cast to object. df2 = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df2.iloc[1, 1] = pd.NaT df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal( - df2.dtypes, - Series( - [np.dtype("uint64"), np.dtype("O"), np.dtype("O")], - index=["A", "B", "C"], - ), - ) def test_object_casting_indexing_wraps_datetimelike(): @@ -1925,23 +1886,30 @@ def test_add_new_column_infer_string(): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py - # but checks for warnings instead of errors. 
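The "Manually upcast" edit above doubles as the migration recipe for affected user code: cast the columns explicitly, then assign in place. A minimal sketch:

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"])
>>> df = df.astype({"A": "float64", "B": "float64"})       # upcast explicitly first
>>> df.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5    # now held losslessly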
- def _check_setitem_invalid(self, df, invalid, indexer, warn): - msg = "Setting an item of incompatible dtype is deprecated" - msg = re.escape(msg) - + def _check_setitem_invalid(self, df, invalid, indexer): orig_df = df.copy() # iloc - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[indexer, 0] = invalid df = orig_df.copy() # loc - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): df.loc[indexer, "a"] = invalid df = orig_df.copy() + def _check_setitem_valid(self, df, value, indexer): + orig_df = df.copy() + + # iloc + df.iloc[indexer, 0] = value + df = orig_df.copy() + + # loc + df.loc[indexer, "a"] = value + df = orig_df.copy() + _invalid_scalars = [ 1 + 2j, "True", @@ -1959,20 +1927,19 @@ def _check_setitem_invalid(self, df, invalid, indexer, warn): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): df = DataFrame({"a": [True, False, False]}, dtype="bool") - self._check_setitem_invalid(df, invalid, indexer, FutureWarning) + self._check_setitem_invalid(df, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid): - warn = None + self._check_setitem_valid(df, invalid, indexer) else: - warn = FutureWarning - self._check_setitem_invalid(df, invalid, indexer, warn) + self._check_setitem_invalid(df, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer, FutureWarning) + self._check_setitem_invalid(df, invalid, indexer) diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index ce771280bc264..aaf95daf232e2 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas.core.dtypes.common import is_float_dtype @@ -6,7 +7,6 @@ DataFrame, isna, ) -import pandas._testing as tm class TestSetValue: @@ -40,11 +40,8 @@ def test_set_value_resize(self, float_frame, using_infer_string): assert is_float_dtype(res["baz"]) assert isna(res["baz"].drop(["foobar"])).all() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): res._set_value("foobar", "baz", "sam") - assert res.loc["foobar", "baz"] == "sam" def test_set_value_with_index_dtype_change(self): df_orig = DataFrame( diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 15cdc6566b570..df3b058ca51f9 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1356,18 +1356,12 @@ def test_frame_setitem_empty_dataframe(self): def test_full_setter_loc_incompatible_dtype(): # https://github.com/pandas-dev/pandas/issues/55791 df = DataFrame({"a": [1, 2]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with 
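The helpers above re-copy the original frame between attempts, which also documents that a rejected assignment is a no-op: in these single-column cases the TypeError fires before anything is written. For instance:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2, 3]})
>>> before = df.copy()
>>> try:
...     df.iloc[0, 0] = "x"
... except TypeError:
...     pass
>>> df.equals(before)
True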
pytest.raises(TypeError, match="Invalid value"): df.loc[:, "a"] = True - expected = DataFrame({"a": [True, True]}) - tm.assert_frame_equal(df, expected) - df = DataFrame({"a": [1, 2]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "a"] = {0: 3.5, 1: 4.5} - expected = DataFrame({"a": [3.5, 4.5]}) - tm.assert_frame_equal(df, expected) - df = DataFrame({"a": [1, 2]}) df.loc[:, "a"] = {0: 3, 1: 4} expected = DataFrame({"a": [3, 4]}) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index aeffc4835a347..0f22ff52d5212 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -513,26 +513,15 @@ def test_where_axis_with_upcast(self): tm.assert_frame_equal(result, expected) result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, ser, axis="index", inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Invalid value"): + result.where(mask, ser, axis="index", inplace=True) expected = DataFrame([[0, np.nan], [0, np.nan]]) result = df.where(mask, ser, axis="columns") tm.assert_frame_equal(result, expected) - expected = DataFrame( - { - 0: np.array([0, 0], dtype="int64"), - 1: np.array([np.nan, np.nan], dtype="float64"), - } - ) - result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, ser, axis="columns", inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Invalid value"): + df.where(mask, ser, axis="columns", inplace=True) def test_where_axis_multiple_dtypes(self): # Multiple dtypes (=> multiple Blocks) @@ -584,15 +573,10 @@ def test_where_axis_multiple_dtypes(self): result = df.where(mask, d1, axis="index") tm.assert_frame_equal(result, expected) result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, d1, inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) - result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): + result.where(mask, d1, inplace=True) + with pytest.raises(TypeError, match="Invalid value"): return_value = result.where(mask, d1, inplace=True, axis="index") - assert return_value is None - tm.assert_frame_equal(result, expected) d2 = df.copy().drop(1, axis=1) expected = df.copy() @@ -739,11 +723,8 @@ def test_where_interval_fullop_downcast(self, frame_or_series): res = obj.where(~obj.notna(), other) tm.assert_equal(res, other) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj.mask(obj.notna(), other, inplace=True) - tm.assert_equal(obj, other.astype(object)) @pytest.mark.parametrize( "dtype", @@ -773,14 +754,10 @@ def test_where_datetimelike_noop(self, dtype): res4 = df.mask(mask2, "foo") tm.assert_frame_equal(res4, df) - expected = DataFrame(4, index=df.index, columns=df.columns) # unlike where, Block.putmask does not downcast - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with 
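test_full_setter_loc_incompatible_dtype draws the same line for the full-column form: `df.loc[:, col] = ...` keeps the existing dtype, so same-kind values still work while lossy ones raise. Illustrative:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2]})
>>> df.loc[:, "a"] = {0: 3, 1: 4}       # ints into int64: still allowed
>>> df.loc[:, "a"] = {0: 3.5, 1: 4.5}   # would need float64: raises TypeError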
pytest.raises(TypeError, match="Invalid value"): df.mask(~mask2, 4, inplace=True) - tm.assert_frame_equal(df, expected.astype(object)) def test_where_int_downcasting_deprecated(): @@ -934,11 +911,8 @@ def test_where_period_invalid_na(frame_or_series, as_cat, request): result = obj.mask(mask, tdnat) tm.assert_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj.mask(mask, tdnat, inplace=True) - tm.assert_equal(obj, expected) def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): @@ -1020,9 +994,7 @@ def test_where_dt64_2d(): "B": dta[:, 1], } ) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): _check_where_equivalences(df, mask, other, expected) # setting nothing in either column diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 269b9e372bd70..ea63b2264d4f6 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -152,18 +152,9 @@ def test_update_with_different_dtype(self): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) df["c"] = np.nan - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.update({"c": Series(["foo"], index=[0])}) - expected = DataFrame( - { - "a": [1, 3], - "b": [np.nan, 2], - "c": Series(["foo", np.nan], dtype="object"), - } - ) - tm.assert_frame_equal(df, expected) - def test_update_modify_view(self, using_infer_string): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c0b9e6549c4ba..2d5772eb5cb53 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2522,11 +2522,13 @@ def check_views(c_only: bool = False): check_views() # TODO: most of the rest of this test belongs in indexing tests - if lib.is_np_dtype(df.dtypes.iloc[0], "fciuO"): - warn = None + should_raise = not lib.is_np_dtype(df.dtypes.iloc[0], "fciuO") + if should_raise: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[0, 0] = 0 + df.iloc[0, 1] = 0 + return else: - warn = FutureWarning - with tm.assert_produces_warning(warn, match="incompatible dtype"): df.iloc[0, 0] = 0 df.iloc[0, 1] = 0 if not copy: diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index abf89c2b0d096..d732cb4d7fbbc 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -213,13 +213,11 @@ def test_multiindex_assignment_single_dtype(self): tm.assert_series_equal(result, exp) # arr + 0.5 cannot be cast losslessly to int, so we upcast - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[4, "c"] = arr + 0.5 - result = df.loc[4, "c"] - exp = exp + 0.5 - tm.assert_series_equal(result, exp) + # Upcast so that we can add .5 + df = df.astype({"c": "float64"}) + df.loc[4, "c"] = arr + 0.5 # scalar ok df.loc[4, "c"] = 10 diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 217ca74bd7fbd..10a8fa88b4b5e 100644 --- a/pandas/tests/indexing/test_at.py +++ 
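DataFrame.update funnels through the same setitem machinery, hence the raise expected above. One possible migration, not part of this diff, is to move the target column to a dtype that can hold the incoming values before updating:

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 3], "b": [np.nan, 2]})
>>> df["c"] = np.nan
>>> df["c"] = df["c"].astype(object)                   # make room for strings first
>>> df.update({"c": pd.Series(["foo"], index=[0])})    # without the cast this raises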
b/pandas/tests/indexing/test_at.py @@ -24,12 +24,8 @@ def test_at_timezone(): # https://github.com/pandas-dev/pandas/issues/33544 result = DataFrame({"foo": [datetime(2000, 1, 1)]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc) - expected = DataFrame( - {"foo": [datetime(2000, 1, 2, tzinfo=timezone.utc)]}, dtype=object - ) - tm.assert_frame_equal(result, expected) def test_selection_methods_of_assigned_col(): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index efae0b4dd84cc..64d8068fa9291 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -285,11 +285,9 @@ def test_detect_chained_assignment_changing_dtype(self): df.loc[2]["C"] = "foo" tm.assert_frame_equal(df, df_original) # TODO: Use tm.raises_chained_assignment_error() when PDEP-6 is enforced - with tm.raises_chained_assignment_error( - extra_warnings=(FutureWarning,), extra_match=(None,) - ): - df["C"][2] = "foo" - tm.assert_frame_equal(df, df_original) + with pytest.raises(TypeError, match="Invalid value"): + with tm.raises_chained_assignment_error(): + df["C"][2] = "foo" def test_setting_with_copy_bug(self): # operating on a copy diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 8b90a6c32849d..417925f8ecb0d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -9,7 +9,6 @@ from pandas.errors import IndexingError from pandas import ( - NA, Categorical, CategoricalDtype, DataFrame, @@ -528,10 +527,9 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self): assert len(df._mgr.blocks) == 1 # if the assigned values cannot be held by existing integer arrays, - # we cast - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + # we raise + with pytest.raises(TypeError, match="Invalid value"): df.iloc[:, 0] = df.iloc[:, 0] + 0.5 - assert len(df._mgr.blocks) == 2 expected = df.copy() @@ -1445,7 +1443,5 @@ def test_iloc_setitem_pure_position_based(self): def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") - expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 60a3ccf0b7483..61cbb1983e49a 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -180,14 +180,8 @@ def test_setitem_dtype_upcast(self): df["c"] = np.nan assert df["c"].dtype == np.float64 - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[0, "c"] = "foo" - expected = DataFrame( - {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)} - ) - tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("val", [3.14, "wxyz"]) def test_setitem_dtype_upcast2(self, val): @@ -199,19 +193,8 @@ def test_setitem_dtype_upcast2(self, val): ) left = df.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of 
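The test_at_timezone rewrite reflects that mixing tz-naive and tz-aware values is no longer a silent cast to object, e.g. (assuming this patch):

>>> from datetime import datetime, timezone
>>> import pandas as pd
>>> df = pd.DataFrame({"foo": [datetime(2000, 1, 1)]})
>>> df.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc)   # raises TypeError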
incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left.loc["a", "bar"] = val - right = DataFrame( - [[0, val, 2], [3, 4, 5]], - index=list("ab"), - columns=["foo", "bar", "baz"], - ) - - tm.assert_frame_equal(left, right) - assert is_integer_dtype(left["foo"]) - assert is_integer_dtype(left["baz"]) def test_setitem_dtype_upcast3(self): left = DataFrame( @@ -219,21 +202,9 @@ def test_setitem_dtype_upcast3(self): index=list("ab"), columns=["foo", "bar", "baz"], ) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left.loc["a", "bar"] = "wxyz" - right = DataFrame( - [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]], - index=list("ab"), - columns=["foo", "bar", "baz"], - ) - - tm.assert_frame_equal(left, right) - assert is_float_dtype(left["foo"]) - assert is_float_dtype(left["baz"]) - def test_dups_fancy_indexing(self): # GH 3455 @@ -728,7 +699,7 @@ def run_tests(df, rhs, right_loc, right_iloc): frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}") right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): run_tests(df, rhs, right_loc, right_iloc) @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 903ad24ce53b3..b8d012eca28ce 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -16,7 +16,6 @@ from pandas._config import using_pyarrow_string_dtype from pandas._libs import index as libindex -from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas as pd @@ -383,12 +382,8 @@ def test_loc_setitem_slice(self): df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") ix = df1["a"] == 1 newb2 = df2.loc[ix, "b"] - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df1.loc[ix, "b"] = newb2 - expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") - tm.assert_frame_equal(df2, expected) def test_loc_setitem_dtype(self): # GH31340 @@ -572,54 +567,31 @@ def frame_for_consistency(self): def test_loc_setitem_consistency(self, frame_for_consistency, val): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series(0, index=range(5), dtype=np.int64), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = val - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series("foo", index=range(5)), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = "foo" - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected 
= DataFrame( - { - "date": Series(1.0, index=range(5)), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = 1.0 - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = "string" - expected = DataFrame({"date": Series(["string"])}) - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_empty(self): # empty (essentially noops) @@ -677,16 +649,11 @@ def test_loc_setitem_consistency_slice_column_len(self): # timedelta64[m] -> float, so this cannot be done inplace, so # no warning - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, ("Respondent", "Duration")] = df.loc[ :, ("Respondent", "Duration") ] / Timedelta(60_000_000_000) - expected = Series( - [23.0, 12.0, 14.0, 36.0], index=df.index, name=("Respondent", "Duration") - ) - tm.assert_series_equal(df[("Respondent", "Duration")], expected) - @pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s", "ms", "us"]) def test_loc_assign_non_ns_datetime(self, unit): # GH 27395, non-ns dtype assignment via .loc should work @@ -1411,13 +1378,9 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 @@ -1444,9 +1407,8 @@ def test_loc_setitem_datetime_coercion(self): df.loc[0:1, "c"] = np.datetime64("2008-08-08") assert Timestamp("2008-08-08") == df.loc[0, "c"] assert Timestamp("2008-08-08") == df.loc[1, "c"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[2, "c"] = date(2005, 5, 5) - assert Timestamp("2005-05-05").date() == df.loc[2, "c"] @pytest.mark.parametrize("idxer", ["var", ["var"]]) def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): @@ -1457,12 +1419,13 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype result = DataFrame(index=idx, columns=["var"], dtype=np.float64) - with tm.assert_produces_warning( - FutureWarning if idxer == "var" else None, match="incompatible dtype" - ): + if idxer == "var": + with pytest.raises(TypeError, match="Invalid value"): + result.loc[:, idxer] = expected + else: # See https://github.com/pandas-dev/pandas/issues/56223 result.loc[:, idxer] = expected - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_loc_setitem_time_key(self): index = 
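In test_loc_setitem_datetime_coercion only the datetime.date assignment changes behavior: a np.datetime64 can be held by the column losslessly, a plain date cannot, so the latter now raises instead of being stored via upcast. Roughly:

>>> from datetime import date
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"c": pd.date_range("2008-08-08", periods=3)})
>>> df.loc[0, "c"] = np.datetime64("2008-09-09")   # still fine
>>> df.loc[2, "c"] = date(2005, 5, 5)              # raises TypeError under this patch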
date_range("2012-01-01", "2012-01-05", freq="30min") @@ -1608,16 +1571,8 @@ def test_loc_setitem_cast2(self): # dtype conversion on setting df = DataFrame(np.random.default_rng(2).random((30, 3)), columns=tuple("ABC")) df["event"] = np.nan - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[10, "event"] = "foo" - result = df.dtypes - expected = Series( - [np.dtype("float64")] * 3 + [np.dtype("object")], - index=["A", "B", "C", "event"], - ) - tm.assert_series_equal(result, expected) def test_loc_setitem_cast3(self): # Test that data type is preserved . GH#5782 @@ -2972,20 +2927,9 @@ def test_loc_setitem_uint8_upcast(value): # GH#26049 df = DataFrame([1, 2, 3, 4], columns=["col1"], dtype="uint8") - with tm.assert_produces_warning(FutureWarning, match="item of incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[2, "col1"] = value # value that can't be held in uint8 - if np_version_gt2 and isinstance(value, np.int16): - # Note, result type of uint8 + int16 is int16 - # in numpy < 2, though, numpy would inspect the - # value and see that it could fit in an uint16, resulting in a uint16 - dtype = "int16" - else: - dtype = "uint16" - - expected = DataFrame([1, 2, 300, 4], columns=["col1"], dtype=dtype) - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "fill_val,exp_dtype", diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index fca1ed39c0f9c..579d3fbfb3435 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1280,20 +1280,19 @@ def test_interval_can_hold_element(self, dtype, element): # `elem` to not have the same length as `arr` ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither") elem = element(ii2) - msg = "Setting an item of incompatible dtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)]) elem = element(ii3) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)]) elem = element(ii4) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) @@ -1313,13 +1312,12 @@ def test_period_can_hold_element(self, element): # `elem` to not have the same length as `arr` pi2 = pi.asfreq("D")[:-1] elem = element(pi2) - msg = "Setting an item of incompatible dtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, pi, False) dti = pi.to_timestamp("s")[:-1] elem = element(dti) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, pi, False) def check_can_hold_element(self, obj, elem, inplace: bool): diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 5002b6d20da09..228e5cb509982 100644 --- 
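The uint8 test above can drop its numpy-version-dependent expected dtype entirely, since an out-of-range value now raises instead of picking an upcast:

>>> import pandas as pd
>>> df = pd.DataFrame([1, 2, 3, 4], columns=["col1"], dtype="uint8")
>>> df.loc[2, "col1"] = 300
Traceback (most recent call last):
  ...
TypeError: Invalid value '300' for dtype 'uint8'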
a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -432,28 +432,38 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py - # but checks for warnings instead of errors. - def _check_setitem_invalid(self, ser, invalid, indexer, warn): - msg = "Setting an item of incompatible dtype is deprecated" - msg = re.escape(msg) - + def _check_setitem_invalid(self, ser, invalid, indexer): orig_ser = ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser.loc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser[:] = invalid + def _check_setitem_valid(self, ser, value, indexer): + orig_ser = ser.copy() + + ser[indexer] = value + ser = orig_ser.copy() + + ser.iloc[indexer] = value + ser = orig_ser.copy() + + ser.loc[indexer] = value + ser = orig_ser.copy() + + ser[:] = value + _invalid_scalars = [ 1 + 2j, "True", @@ -471,20 +481,19 @@ def _check_setitem_invalid(self, ser, invalid, indexer, warn): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): ser = Series([True, False, False], dtype="bool") - self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + self._check_setitem_invalid(ser, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) if isna(invalid) and invalid is not NaT and not np.isnat(invalid): - warn = None + self._check_setitem_valid(ser, invalid, indexer) else: - warn = FutureWarning - self._check_setitem_invalid(ser, invalid, indexer, warn) + self._check_setitem_invalid(ser, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): ser = Series([1, 2, None], dtype=float_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + self._check_setitem_invalid(ser, invalid, indexer) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 69fba8925784e..253339f8a6446 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1,3 +1,4 @@ +import contextlib from datetime import ( date, datetime, @@ -273,25 +274,16 @@ def test_setitem_mask_align_and_promote(self): mask = ts > 0 left = ts.copy() right = ts[mask].copy().map(str) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left[mask] = right - expected = ts.map(lambda t: str(t) if t > 0 else t) - tm.assert_series_equal(left, expected) def test_setitem_mask_promote_strs(self): ser = Series([0, 1, 2, 0]) mask = ser > 0 ser2 = ser[mask].map(str) - 
with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): ser[mask] = ser2 - expected = Series([0, "1", "2", 0]) - tm.assert_series_equal(ser, expected) - def test_setitem_mask_promote(self): ser = Series([0, "foo", "bar", 0]) mask = Series([False, True, True, False]) @@ -379,12 +371,8 @@ def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(self): def test_setitem_nan_with_bool(self): # GH 13034 result = Series([True, False, True]) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): result[0] = np.nan - expected = Series([np.nan, False, True], dtype=object) - tm.assert_series_equal(result, expected) def test_setitem_mask_smallint_upcast(self): orig = Series([1, 2, 3], dtype="int8") @@ -393,22 +381,14 @@ def test_setitem_mask_smallint_upcast(self): mask = np.array([True, False, True]) ser = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): ser[mask] = Series(alt) - expected = Series([999, 2, 1001]) - tm.assert_series_equal(ser, expected) - ser2 = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): - ser2.mask(mask, alt, inplace=True) - tm.assert_series_equal(ser2, expected) + with pytest.raises(TypeError, match="Invalid value"): + ser.mask(mask, alt, inplace=True) - ser3 = orig.copy() - res = ser3.where(~mask, Series(alt)) + res = ser.where(~mask, Series(alt)) + expected = Series([999, 2, 1001]) tm.assert_series_equal(res, expected) def test_setitem_mask_smallint_no_upcast(self): @@ -575,32 +555,35 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): tm.assert_series_equal(ser, expected) @pytest.mark.parametrize( - "na, target_na, dtype, target_dtype, indexer, warn", + "na, target_na, dtype, target_dtype, indexer, raises", [ - (NA, NA, "Int64", "Int64", 1, None), - (NA, NA, "Int64", "Int64", 2, None), - (NA, np.nan, "int64", "float64", 1, None), - (NA, np.nan, "int64", "float64", 2, None), - (NaT, NaT, "int64", "object", 1, FutureWarning), - (NaT, NaT, "int64", "object", 2, None), - (np.nan, NA, "Int64", "Int64", 1, None), - (np.nan, NA, "Int64", "Int64", 2, None), - (np.nan, NA, "Float64", "Float64", 1, None), - (np.nan, NA, "Float64", "Float64", 2, None), - (np.nan, np.nan, "int64", "float64", 1, None), - (np.nan, np.nan, "int64", "float64", 2, None), + (NA, NA, "Int64", "Int64", 1, False), + (NA, NA, "Int64", "Int64", 2, False), + (NA, np.nan, "int64", "float64", 1, False), + (NA, np.nan, "int64", "float64", 2, False), + (NaT, NaT, "int64", "object", 1, True), + (NaT, NaT, "int64", "object", 2, False), + (np.nan, NA, "Int64", "Int64", 1, False), + (np.nan, NA, "Int64", "Int64", 2, False), + (np.nan, NA, "Float64", "Float64", 1, False), + (np.nan, NA, "Float64", "Float64", 2, False), + (np.nan, np.nan, "int64", "float64", 1, False), + (np.nan, np.nan, "int64", "float64", 2, False), ], ) def test_setitem_enlarge_with_na( - self, na, target_na, dtype, target_dtype, indexer, warn + self, na, target_na, dtype, target_dtype, indexer, raises ): # GH#32346 ser = Series([1, 2], dtype=dtype) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + ser[indexer] = na + else: ser[indexer] = na - expected_values = [1, target_na] if indexer == 1 else [1, 
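The reworked test_setitem_enlarge_with_na table encodes a subtle split: overwriting an existing int64 entry with NaT raises, but enlargement at a brand-new label may still upcast, since no existing value is being mutated. Illustrative:

>>> import pandas as pd
>>> ser = pd.Series([1, 2])
>>> ser[1] = pd.NaT
Traceback (most recent call last):
  ...
TypeError: Invalid value 'NaT' for dtype 'int64'
>>> ser[2] = pd.NaT          # enlargement: still allowed
>>> ser.dtype
dtype('O')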
2, target_na] - expected = Series(expected_values, dtype=target_dtype) - tm.assert_series_equal(ser, expected) + expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] + expected = Series(expected_values, dtype=target_dtype) + tm.assert_series_equal(ser, expected) def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): # GH#48665 @@ -694,14 +677,8 @@ def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique): if not unique: ser.index = [1, 1] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_sli(ser)[1] = val - assert type(ser.iloc[1]) == type(val) - - expected = Series([True, val], dtype=object, index=ser.index) - if not unique and indexer_sli is not tm.iloc: - expected = Series([val, val], dtype=object, index=[1, 1]) - tm.assert_series_equal(ser, expected) def test_setitem_boolean_array_into_npbool(self): # GH#45462 @@ -712,10 +689,8 @@ def test_setitem_boolean_array_into_npbool(self): ser[:2] = arr[:2] # no NAs -> can set inplace assert ser._values is values - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[1:] = arr[1:] # has an NA -> cast to boolean dtype - expected = Series(arr) - tm.assert_series_equal(ser, expected) class SetitemCastingEquivalents: @@ -759,64 +734,72 @@ def _check_inplace(self, is_inplace, orig, arr, obj): # otherwise original array should be unchanged tm.assert_equal(arr, orig._values) - def test_int_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + def test_int_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): if not isinstance(key, int): pytest.skip("Not relevant for int key") + if raises: + ctx = pytest.raises(TypeError, match="Invalid value") + else: + ctx = contextlib.nullcontext() - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) if indexer_sli is tm.loc: - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, tm.at, is_inplace) elif indexer_sli is tm.iloc: - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, tm.iat, is_inplace) rng = range(key, key + 1) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, rng, expected, val, indexer_sli, is_inplace) if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently slc = slice(key, key + 1) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, slc, expected, val, indexer_sli, is_inplace) ilkey = [key] - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) genkey = (x for x in [key]) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) - def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + def test_slice_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): if not 
isinstance(key, slice): pytest.skip("Not relevant for slice key") + if raises: + ctx = pytest.raises(TypeError, match="Invalid value") + else: + ctx = contextlib.nullcontext() if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) ilkey = list(range(len(obj)))[key] - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) genkey = (x for x in indkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) - def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): + def test_mask_key(self, obj, key, expected, raises, val, indexer_sli): # setitem with boolean mask mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -829,11 +812,13 @@ def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): indexer_sli(obj)[mask] = val return - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + indexer_sli(obj)[mask] = val + else: indexer_sli(obj)[mask] = val - tm.assert_series_equal(obj, expected) - def test_series_where(self, obj, key, expected, warn, val, is_inplace): + def test_series_where(self, obj, key, expected, raises, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -860,7 +845,7 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val, using_infer_string): + def test_index_where(self, obj, key, expected, raises, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -872,7 +857,7 @@ def test_index_where(self, obj, key, expected, warn, val, using_infer_string): expected_idx = Index(expected, dtype=expected.dtype) tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): + def test_index_putmask(self, obj, key, expected, raises, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -885,7 +870,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): @pytest.mark.parametrize( - "obj,expected,key,warn", + "obj,expected,key,raises", [ pytest.param( # GH#45568 setting a valid NA value into IntervalDtype[int] should @@ -896,7 +881,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): dtype="interval[float64]", ), 1, - FutureWarning, + True, id="interval_int_na_value", ), pytest.param( @@ -904,14 +889,14 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), slice(None, None, 2), - None, + False, id="int_series_slice_key_step", ), pytest.param( Series([True, True, False, False]), Series([np.nan, True, np.nan, False], dtype=object), slice(None, None, 2), - FutureWarning, + True, id="bool_series_slice_key_step", ), pytest.param( @@ -919,7 +904,7 @@ def test_index_putmask(self, obj, key, 
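The contextlib.nullcontext() construction used throughout these tests is the stock idiom for running one code path over both raising and non-raising cases. A standalone sketch (the helper names are made up):

import contextlib

import pytest


def expectation(raises: bool):
    # pytest.raises(...) when a TypeError is expected, else a do-nothing context
    if raises:
        return pytest.raises(TypeError, match="Invalid value")
    return contextlib.nullcontext()


def run_case(ser, key, val, raises: bool):
    with expectation(raises):
        ser[key] = val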
expected, warn, val, using_infer_string): Series(np.arange(10)), Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), slice(None, 5), - None, + False, id="int_series_slice_key", ), pytest.param( @@ -927,7 +912,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([1, 2, 3]), Series([np.nan, 2, 3]), 0, - None, + False, id="int_series_int_key", ), pytest.param( @@ -936,7 +921,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([np.nan], dtype=object), # TODO: maybe go to float64 since we are changing the _whole_ Series? 0, - FutureWarning, + True, id="bool_series_int_key_change_all", ), pytest.param( @@ -944,7 +929,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([False, True]), Series([np.nan, True], dtype=object), 0, - FutureWarning, + True, id="bool_series_int_key", ), ], @@ -994,8 +979,8 @@ def key(self): return 0 @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemDT64IntoInt(SetitemCastingEquivalents): @@ -1034,8 +1019,8 @@ def val(self, scalar, request): return box([scalar, scalar]) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemNAPeriodDtype(SetitemCastingEquivalents): @@ -1061,8 +1046,8 @@ def val(self, request): return request.param @pytest.fixture - def warn(self): - return None + def raises(self): + return False class TestSetitemNADatetimeLikeDtype(SetitemCastingEquivalents): @@ -1114,8 +1099,8 @@ def key(self): return 0 @pytest.fixture - def warn(self, is_inplace): - return None if is_inplace else FutureWarning + def raises(self, is_inplace): + return False if is_inplace else True class TestSetitemMismatchedTZCastsToObject(SetitemCastingEquivalents): @@ -1146,24 +1131,23 @@ def expected(self, obj, val): return expected @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "obj,expected,warn", + "obj,expected", [ # For numeric series, we should coerce to NaN. - (Series([1, 2, 3]), Series([np.nan, 2, 3]), None), - (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None), + (Series([1, 2, 3]), Series([np.nan, 2, 3])), + (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0])), # For datetime series, we should coerce to NaT. ( Series([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]), Series([NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), - None, ), # For objects, we should preserve the None value. 
- (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"]), None), + (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"])), ], ) class TestSeriesNoneCoercion(SetitemCastingEquivalents): @@ -1175,6 +1159,10 @@ def key(self): def val(self): return None + @pytest.fixture + def raises(self): + return False + class TestSetitemFloatIntervalWithIntIntervalValues(SetitemCastingEquivalents): # GH#44201 Cast to shared IntervalDtype rather than object @@ -1185,11 +1173,8 @@ def test_setitem_example(self): obj = Series(idx) val = Interval(0.5, 1.5) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj[0] = val - assert obj.dtype == "Interval[float64, right]" @pytest.fixture def obj(self): @@ -1211,8 +1196,8 @@ def expected(self, obj, val): return Series(idx) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemRangeIntoIntegerSeries(SetitemCastingEquivalents): @@ -1240,18 +1225,18 @@ def expected(self, any_int_numpy_dtype): return exp @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val, warn", + "val, raises", [ - (np.array([2.0, 3.0]), None), - (np.array([2.5, 3.5]), FutureWarning), + (np.array([2.0, 3.0]), False), + (np.array([2.5, 3.5]), True), ( np.array([2**65, 2**65 + 1], dtype=np.float64), - FutureWarning, + True, ), # all ints, but can't cast ], ) @@ -1291,8 +1276,8 @@ def expected(self): return Series([1, 512, 3], dtype=np.int16) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True @pytest.mark.parametrize("val", [2**33 + 1.0, 2**33 + 1.1, 2**62]) @@ -1315,8 +1300,8 @@ def expected(self, val): return Series([val, 2, 3], dtype=dtype) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class CoercionTest(SetitemCastingEquivalents): @@ -1334,8 +1319,8 @@ def expected(self, obj, key, val, exp_dtype): @pytest.mark.parametrize( - "val,exp_dtype,warn", - [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, FutureWarning)], + "val,exp_dtype,raises", + [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, True)], ) class TestCoercionInt8(CoercionTest): # previously test_setitem_series_int8 in tests.indexing.test_coercion @@ -1353,17 +1338,17 @@ def obj(self): return Series(["a", "b", "c", "d"], dtype=object) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.complex128, None), - (1.1, np.complex128, None), - (1 + 1j, np.complex128, None), - (True, object, FutureWarning), + (1, np.complex128, False), + (1.1, np.complex128, False), + (1 + 1j, np.complex128, False), + (True, object, True), ], ) class TestCoercionComplex(CoercionTest): @@ -1374,14 +1359,14 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, object, FutureWarning), - ("3", object, FutureWarning), - (3, object, FutureWarning), - (1.1, object, FutureWarning), - (1 + 1j, object, FutureWarning), - (True, bool, None), + (1, object, True), + ("3", object, True), + (3, object, True), + (1.1, object, True), + (1 + 1j, object, True), + (True, bool, False), ], ) class TestCoercionBool(CoercionTest): @@ -1392,12 +1377,12 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.int64, None), - (1.1, np.float64, 
FutureWarning), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), + (1, np.int64, False), + (1.1, np.float64, True), + (1 + 1j, np.complex128, True), + (True, object, True), ], ) class TestCoercionInt64(CoercionTest): @@ -1408,12 +1393,12 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.float64, None), - (1.1, np.float64, None), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), + (1, np.float64, False), + (1.1, np.float64, False), + (1 + 1j, np.complex128, True), + (True, object, True), ], ) class TestCoercionFloat64(CoercionTest): @@ -1424,13 +1409,13 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.float32, None), + (1, np.float32, False), pytest.param( 1.1, np.float32, - None, + False, marks=pytest.mark.xfail( ( not np_version_gte1p24 @@ -1440,16 +1425,16 @@ def obj(self): "np_can_hold_element raises and we cast to float64", ), ), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), - (np.uint8(2), np.float32, None), - (np.uint32(2), np.float32, None), + (1 + 1j, np.complex128, True), + (True, object, True), + (np.uint8(2), np.float32, False), + (np.uint32(2), np.float32, False), # float32 cannot hold np.iinfo(np.uint32).max exactly # (closest it can hold is 4294967300.0 which off by 5.0), so # we cast to float64 - (np.uint32(np.iinfo(np.uint32).max), np.float64, FutureWarning), - (np.uint64(2), np.float32, None), - (np.int64(2), np.float32, None), + (np.uint32(np.iinfo(np.uint32).max), np.float64, True), + (np.uint64(2), np.float32, False), + (np.int64(2), np.float32, False), ], ) class TestCoercionFloat32(CoercionTest): @@ -1457,8 +1442,8 @@ class TestCoercionFloat32(CoercionTest): def obj(self): return Series([1.1, 2.2, 3.3, 4.4], dtype=np.float32) - def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): - super().test_slice_key(obj, key, expected, warn, val, indexer_sli, is_inplace) + def test_slice_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): + super().test_slice_key(obj, key, expected, raises, val, indexer_sli, is_inplace) if isinstance(val, float): # the xfail would xpass bc test_slice_key short-circuits @@ -1494,16 +1479,16 @@ def val(self, exp_dtype): return ts @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timestamp("2012-01-01"), "datetime64[ns]", None), - (1, object, FutureWarning), - ("x", object, FutureWarning), + (Timestamp("2012-01-01"), "datetime64[ns]", False), + (1, object, True), + ("x", object, True), ], ) class TestCoercionDatetime64(CoercionTest): @@ -1514,18 +1499,18 @@ def obj(self): return Series(date_range("2011-01-01", freq="D", periods=4)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", None), + (Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", False), # pre-2.0, a mis-matched tz would end up casting to object - (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", None), - (Timestamp("2012-01-01"), object, FutureWarning), - (1, object, FutureWarning), + (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", False), + (Timestamp("2012-01-01"), object, True), + 
(1, object, True), ], ) class TestCoercionDatetime64TZ(CoercionTest): @@ -1536,16 +1521,16 @@ def obj(self): return Series(date_range("2011-01-01", freq="D", periods=4, tz=tz)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timedelta("12 day"), "timedelta64[ns]", None), - (1, object, FutureWarning), - ("x", object, FutureWarning), + (Timedelta("12 day"), "timedelta64[ns]", False), + (1, object, True), + ("x", object, True), ], ) class TestCoercionTimedelta64(CoercionTest): @@ -1555,8 +1540,8 @@ def obj(self): return Series(timedelta_range("1 day", periods=4)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( @@ -1575,63 +1560,45 @@ def obj(self, request): return Series(request.param) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True def test_20643(): # closed by GH#45121 orig = Series([0, 1, 2], index=["a", "b", "c"]) - expected = Series([0, 2.7, 2], index=["a", "b", "c"]) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.at["b"] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.loc["b"] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser["b"] = 2.7 - tm.assert_series_equal(ser, expected) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iat[1] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[1] = 2.7 - tm.assert_series_equal(ser, expected) orig_df = orig.to_frame("A") - expected_df = expected.to_frame("A") df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.at["b", "A"] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc["b", "A"] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[1, 0] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.iat[1, 0] = 2.7 - tm.assert_frame_equal(df, expected_df) def test_20643_comment(): @@ -1653,35 +1620,23 @@ def test_15413(): # fixed by GH#45121 ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[ser == 2] += 0.5 - expected = Series([1, 2.5, 3]) - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with 
pytest.raises(TypeError, match="Invalid value"): ser[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.loc[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iat[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.at[1] += 0.5 - tm.assert_series_equal(ser, expected) def test_32878_int_itemsize(): @@ -1689,10 +1644,8 @@ def test_32878_int_itemsize(): arr = np.arange(5).astype("i4") ser = Series(arr) val = np.int64(np.iinfo(np.int64).max) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = val - expected = Series([val, 1, 2, 3, 4], dtype=np.int64) - tm.assert_series_equal(ser, expected) def test_32878_complex_itemsize(): @@ -1702,20 +1655,15 @@ def test_32878_complex_itemsize(): val = val.astype("c16") # GH#32878 used to coerce val to inf+0.000000e+00j - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = val - assert ser[0] == val - expected = Series([val, 1, 2, 3, 4], dtype="c16") - tm.assert_series_equal(ser, expected) def test_37692(indexer_al): # GH#37692 ser = Series([1, 2, 3], index=["a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_al(ser)["b"] = "test" - expected = Series([1, "test", 3], index=["a", "b", "c"], dtype=object) - tm.assert_series_equal(ser, expected) def test_setitem_bool_int_float_consistency(indexer_sli): @@ -1725,14 +1673,12 @@ def test_setitem_bool_int_float_consistency(indexer_sli): # as the setitem can be done losslessly for dtype in [np.float64, np.int64]: ser = Series(0, index=range(3), dtype=dtype) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_sli(ser)[0] = True - assert ser.dtype == object ser = Series(0, index=range(3), dtype=bool) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = dtype(1) - assert ser.dtype == object # 1.0 can be held losslessly, so no casting ser = Series(0, index=range(3), dtype=np.int64) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 4979bcb42d7ab..7718899ff234b 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -55,15 +55,13 @@ def test_where_unsafe_upcast(dtype, expected_dtype): s = Series(np.arange(10), dtype=dtype) values = [2.5, 3.5, 4.5, 5.5, 6.5] mask = s < 5 - expected = Series(values + list(range(5, 10)), dtype=expected_dtype) - warn = ( - None - if np.dtype(dtype).kind == np.dtype(expected_dtype).kind == "f" - else FutureWarning - ) - with 
tm.assert_produces_warning(warn, match="incompatible dtype"): + if np.dtype(dtype).kind == np.dtype(expected_dtype).kind == "f": s[mask] = values - tm.assert_series_equal(s, expected) + expected = Series(values + list(range(5, 10)), dtype=expected_dtype) + tm.assert_series_equal(s, expected) + else: + with pytest.raises(TypeError, match="Invalid value"): + s[mask] = values def test_where_unsafe(): @@ -74,9 +72,10 @@ def test_where_unsafe(): mask = s > 5 expected = Series(list(range(6)) + values, dtype="float64") - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): s[mask] = values - tm.assert_series_equal(s, expected) + s = s.astype("float64") + s[mask] = values # see gh-3235 s = Series(np.arange(10), dtype="int64") diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index f6f3a3b0fb07e..7c96a5b0f00d1 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -231,7 +231,7 @@ def test_convert_dtypes( copy = series.copy(deep=True) if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result[result.notna()] = np.nan else: result[result.notna()] = np.nan diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index e1c771ec6e658..f53d75df83124 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -158,9 +158,8 @@ def test_fillna_consistency(self): # assignment ser2 = ser.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser2[1] = "foo" - tm.assert_series_equal(ser2, expected) def test_timedelta_fillna(self, frame_or_series, unit): # GH#3371 diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index 1d29e116be5c2..9b5fb098bf3ee 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -35,37 +35,39 @@ def test_update(self): tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - "other, dtype, expected, warn", + "other, dtype, expected, raises", [ # other is int - ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), None), - ([61, 63], "int64", Series([10, 61, 12]), None), - ([61, 63], float, Series([10.0, 61.0, 12.0]), None), - ([61, 63], object, Series([10, 61, 12], dtype=object), None), + ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), False), + ([61, 63], "int64", Series([10, 61, 12]), False), + ([61, 63], float, Series([10.0, 61.0, 12.0]), False), + ([61, 63], object, Series([10, 61, 12], dtype=object), False), # other is float, but can be cast to int - ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), None), - ([61.0, 63.0], "int64", Series([10, 61, 12]), None), - ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), None), - ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), None), + ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), False), + ([61.0, 63.0], "int64", Series([10, 61, 12]), False), + ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), False), + ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), False), # others is float, cannot be cast to int - ([61.1, 63.1], "int32", 
Series([10.0, 61.1, 12.0]), FutureWarning), - ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), FutureWarning), - ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), None), - ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), None), + ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0]), True), + ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), True), + ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), False), + ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), False), # other is object, cannot be cast - ([(61,), (63,)], "int32", Series([10, (61,), 12]), FutureWarning), - ([(61,), (63,)], "int64", Series([10, (61,), 12]), FutureWarning), - ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), FutureWarning), - ([(61,), (63,)], object, Series([10, (61,), 12]), None), + ([(61,), (63,)], "int32", Series([10, (61,), 12]), True), + ([(61,), (63,)], "int64", Series([10, (61,), 12]), True), + ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), True), + ([(61,), (63,)], object, Series([10, (61,), 12]), False), ], ) - def test_update_dtypes(self, other, dtype, expected, warn): + def test_update_dtypes(self, other, dtype, expected, raises): ser = Series([10, 11, 12], dtype=dtype) other = Series(other, index=[1, 3]) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + ser.update(other) + else: ser.update(other) - - tm.assert_series_equal(ser, expected) + tm.assert_series_equal(ser, expected) @pytest.mark.parametrize( "values, other, expected", diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 108c3aabb1aa4..1c88329a83b0e 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -37,13 +37,8 @@ def test_timedelta64_nan(self): assert not isna(td1[0]) # GH#16674 iNaT is treated as an integer when given by the user - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): td1[1] = iNaT - assert not isna(td1[1]) - assert td1.dtype == np.object_ - assert td1[1] == iNaT - td1[1] = td[1] - assert not isna(td1[1]) td1[2] = NaT assert isna(td1[2]) diff --git a/web/pandas/pdeps/0006-ban-upcasting.md b/web/pandas/pdeps/0006-ban-upcasting.md index a86455b70c71a..ae5872186bf23 100644 --- a/web/pandas/pdeps/0006-ban-upcasting.md +++ b/web/pandas/pdeps/0006-ban-upcasting.md @@ -1,7 +1,7 @@ # PDEP-6: Ban upcasting in setitem-like operations - Created: 23 December 2022 -- Status: Accepted +- Status: Implemented - Discussion: [#39584](https://github.com/pandas-dev/pandas/pull/50402) - Author: [Marco Gorelli](https://github.com/MarcoGorelli) ([original issue](https://github.com/pandas-dev/pandas/issues/39584) by [Joris Van den Bossche](https://github.com/jorisvandenbossche)) - Revision: 1 @@ -244,3 +244,4 @@ Deprecate sometime in the 2.x releases (after 2.0.0 has already been released), ### PDEP History - 23 December 2022: Initial draft +- 4 July 2024: Change status to "implemented" From bd405e850524f359f88f03a207e2543aa341bc48 Mon Sep 17 00:00:00 2001 From: Trevor Serrao Date: Mon, 8 Jul 2024 10:44:17 -0500 Subject: [PATCH 161/272] ENH: Allow adjust=False when times is provided (#59142) * add adjust parameter to the ewma variable times test. 
Add tests for disallowed decay-specification parameters when times is specified and adjust=False * allow adjust=False when times is provided * re-calculate alpha each iteration for irregular-spaced time series * whatsnew entry for allowing adjust=False with times * pre-commit style fixes * reduce line lengths to comply with pre-commit * reduce line lengths and apply ruff-reformat changes --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/window/aggregations.pyx | 3 ++ pandas/core/window/ewm.py | 11 +++-- pandas/core/window/numba_.py | 6 +++ pandas/tests/window/test_ewm.py | 61 ++++++++++++++++++++++++---- 5 files changed, 72 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 711bd417d979c..0acb82ffeca3e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -42,6 +42,7 @@ Other enhancements - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) +- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 6365c030b695b..5b9ee095d4643 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1813,6 +1813,9 @@ def ewm(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, if normalize: # avoid numerical errors on constant series if weighted != cur: + if not adjust and com == 1: + # update in case of irregular-interval series + new_wt = 1. - old_wt weighted = old_wt * weighted + new_wt * cur weighted /= (old_wt + new_wt) if adjust: diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index b2855ff1f4048..43a3c03b6cef9 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -134,8 +134,10 @@ class ExponentialMovingWindow(BaseWindow): Provide exponentially weighted (EW) calculations. Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be - provided if ``times`` is not provided. If ``times`` is provided, + provided if ``times`` is not provided. If ``times`` is provided and ``adjust=True``, ``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided. + If ``times`` is provided and ``adjust=False``, ``halflife`` must be the only + provided decay-specification parameter. 
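
For reference, a minimal sketch of the new constraint (illustrative values and
dates; assumes this patch is applied):

    import pandas as pd

    ser = pd.Series([0.0, 1.0, 2.0])
    times = pd.DatetimeIndex(["2024-01-01", "2024-01-02", "2024-01-05"])

    # With adjust=False, halflife is the only decay parameter allowed
    # alongside times:
    ser.ewm(halflife="23 days", times=times, adjust=False).mean()

    # Passing com, span, or alpha as well now raises NotImplementedError:
    # ser.ewm(com=0.5, halflife="23 days", times=times, adjust=False)
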
Parameters ---------- @@ -358,8 +360,6 @@ def __init__( self.ignore_na = ignore_na self.times = times if self.times is not None: - if not self.adjust: - raise NotImplementedError("times is not supported with adjust=False.") times_dtype = getattr(self.times, "dtype", None) if not ( is_datetime64_dtype(times_dtype) @@ -376,6 +376,11 @@ def __init__( # Halflife is no longer applicable when calculating COM # But allow COM to still be calculated if the user passes other decay args if common.count_not_none(self.com, self.span, self.alpha) > 0: + if not self.adjust: + raise NotImplementedError( + "None of com, span, or alpha can be specified if " + "times is provided and adjust=False" + ) self._com = get_center_of_mass(self.com, self.span, None, self.alpha) else: self._com = 1.0 diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 621b0f2c0f2d8..171d3bc1d1c35 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -149,6 +149,9 @@ def ewm( # note that len(deltas) = len(vals) - 1 and deltas[i] # is to be used in conjunction with vals[i+1] old_wt *= old_wt_factor ** deltas[start + j - 1] + if not adjust and com == 1: + # update in case of irregular-interval time series + new_wt = 1.0 - old_wt else: weighted = old_wt_factor * weighted if is_observation: @@ -324,6 +327,9 @@ def ewm_table( # note that len(deltas) = len(vals) - 1 and deltas[i] # is to be used in conjunction with vals[i+1] old_wt[j] *= old_wt_factor ** deltas[i - 1] + if not adjust and com == 1: + # update in case of irregular-interval time series + new_wt = 1.0 - old_wt[j] else: weighted[j] = old_wt_factor * weighted[j] if is_observations[j]: diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 35c896dc0090b..4ea6c805a2ee4 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -102,7 +102,8 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): tm.assert_frame_equal(result, expected) -def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): +def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit, adjust): + # GH 54328 tz = tz_aware_fixture halflife = "23 days" times = ( @@ -112,8 +113,11 @@ def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): ) data = np.arange(3) df = DataFrame(data) - result = df.ewm(halflife=halflife, times=times).mean() - expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + result = df.ewm(halflife=halflife, times=times, adjust=adjust).mean() + if adjust: + expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + else: + expected = DataFrame([0.0, 0.23762518642226227, 1.534926369128742]) tm.assert_frame_equal(result, expected) @@ -148,13 +152,56 @@ def test_ewm_getitem_attributes_retained(arg, adjust, ignore_na): assert result == expected -def test_ewma_times_adjust_false_raises(): - # GH 40098 +def test_ewma_times_adjust_false_with_disallowed_com(): + # GH 54328 + with pytest.raises( + NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), + ): + Series(range(1)).ewm( + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + halflife="1D", + ) + + +def test_ewma_times_adjust_false_with_disallowed_alpha(): + # GH 54328 with pytest.raises( - NotImplementedError, match="times is not supported with adjust=False." 
+        NotImplementedError,
+        match=(
+            "None of com, span, or alpha can be specified "
+            "if times is provided and adjust=False"
+        ),
+    ):
+        Series(range(1)).ewm(
+            0.1,
+            adjust=False,
+            times=date_range("2000", freq="D", periods=1),
+            alpha=0.5,
+            halflife="1D",
+        )
+
+
+def test_ewma_times_adjust_false_with_disallowed_span():
+    # GH 54328
+    with pytest.raises(
+        NotImplementedError,
+        match=(
+            "None of com, span, or alpha can be specified "
+            "if times is provided and adjust=False"
+        ),
     ):
         Series(range(1)).ewm(
-            0.1, adjust=False, times=date_range("2000", freq="D", periods=1)
+            0.1,
+            adjust=False,
+            times=date_range("2000", freq="D", periods=1),
+            span=10,
+            halflife="1D",
         )

From ad09dc6108896e175979c247cff2878d259acf3d Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 8 Jul 2024 08:54:02 -0700
Subject: [PATCH 162/272] Bump pypa/cibuildwheel from 2.19.1 to 2.19.2 (#59208)

Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.19.1 to 2.19.2.
- [Release notes](https://github.com/pypa/cibuildwheel/releases)
- [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md)
- [Commits](https://github.com/pypa/cibuildwheel/compare/v2.19.1...v2.19.2)

---
updated-dependencies:
- dependency-name: pypa/cibuildwheel
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/wheels.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index b92588d81f4ed..f61ef550f74df 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -140,7 +140,7 @@ jobs:
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.19.1
+        uses: pypa/cibuildwheel@v2.19.2
         with:
           package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:

From 9b9e447a0a4409bc0978073e6ce8b8a0b4e7531c Mon Sep 17 00:00:00 2001
From: Ritwiz Sinha <43509699+ritwizsinha@users.noreply.github.com>
Date: Mon, 8 Jul 2024 22:36:33 +0530
Subject: [PATCH 163/272] Update read_html docs (#59209)

---
 pandas/io/html.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index db4c5f8507946..4b8bc48130fab 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -1178,7 +1178,10 @@ def read_html(
         **after** `skiprows` is applied.

     This function will *always* return a list of :class:`DataFrame` *or*
-    it will fail, i.e., it will *not* return an empty list.
+    it will fail, i.e., it will *not* return an empty list, save for some
+    rare cases.
+    It might return an empty list in case of inputs with single row and
+    ``<td>`` containing only whitespaces.
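
For reference, a small sketch of the rare case documented above (the HTML
string is illustrative; per this doc change the parser may yield no tables):

    import pandas as pd
    from io import StringIO

    html = "<table><tr><td>   </td></tr></table>"
    # A single row whose only <td> holds whitespace may parse to an
    # empty list rather than a DataFrame:
    tables = pd.read_html(StringIO(html))  # may return []
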
     Examples
     --------

From 16e9a3eb6f24aa41a3e9c8642a588088513abe78 Mon Sep 17 00:00:00 2001
From: Viswa Sai Ammiraju Bonam <113131386+ViswaBonam@users.noreply.github.com>
Date: Mon, 8 Jul 2024 13:00:07 -0500
Subject: [PATCH 164/272] DOC: Grammatically updated the tech docs "Package
 Overview" (#59206)

* Grammatically updated the tech docs
* Addressing the suggested change
---
 doc/source/getting_started/overview.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst
index 05a7d63b7ff47..a8b7a387d80ec 100644
--- a/doc/source/getting_started/overview.rst
+++ b/doc/source/getting_started/overview.rst
@@ -6,11 +6,11 @@
 Package overview
 ****************

-pandas is a `Python <https://www.python.org/>`__ package providing fast,
+pandas is a `Python <https://www.python.org/>`__ package that provides fast,
 flexible, and expressive data structures designed to make working with
 "relational" or "labeled" data both easy and intuitive. It aims to be the
-fundamental high-level building block for doing practical, **real-world** data
-analysis in Python. Additionally, it has the broader goal of becoming **the
+fundamental high-level building block for Python's practical, **real-world** data
+analysis. Additionally, it seeks to become **the
 most powerful and flexible open source data analysis/manipulation tool
 available in any language**. It is already well on its way toward this goal.

From 58da4b04098c151ef66dab1d4c27573324107bcc Mon Sep 17 00:00:00 2001
From: AnaDenisa
Date: Mon, 8 Jul 2024 19:15:25 +0100
Subject: [PATCH 165/272] DOC: Add notes section to .isin() docs (#59201)

* Add note to doc
* Rephrase
---
 pandas/core/frame.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index fab798dd617b7..5ef663564a016 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -13328,6 +13328,11 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
         Series.str.contains: Test if pattern or regex is contained within a
             string of a Series or Index.

+        Notes
+        -----
+        ``__iter__`` is used (and not ``__contains__``) to iterate over values
+        when checking if it contains the elements in DataFrame.
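
A short usage sketch of the note above (toy frame; matching is against the
elements produced by iterating ``values``):

    import pandas as pd

    df = pd.DataFrame({"num": [1, 2], "chr": ["a", "b"]})
    # Iterating the list yields 1 and "a"; each cell is checked against
    # those elements:
    df.isin([1, "a"])
    #      num    chr
    # 0   True   True
    # 1  False  False
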
+ Examples -------- >>> df = pd.DataFrame( From ab433af410464f4f5c377c82a3d4f5680bf3c65c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jul 2024 08:16:15 -1000 Subject: [PATCH 166/272] REF: Add back attr passing in concat by attribute (#59195) * REF: Add back attr passing in concat by attribute * define reference once --- pandas/core/generic.py | 2 +- pandas/core/reshape/concat.py | 9 ++++++--- pandas/tests/generic/test_frame.py | 3 +-- pandas/tests/generic/test_series.py | 7 +++++-- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 43003553d7ad6..5d9e04bd50979 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6034,7 +6034,7 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": - objs = kwargs["objs"] + objs = other.objs # propagate attrs only if all concat arguments have the same attrs if all(bool(obj.attrs) for obj in objs): # all concatenate arguments have non-empty attrs diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 6381869c3e559..6836ba3f65691 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections import abc +import types from typing import ( TYPE_CHECKING, Literal, @@ -536,7 +537,9 @@ def _get_result( result = sample._constructor_from_mgr(mgr, axes=mgr.axes) result._name = name - return result.__finalize__(object(), method="concat", objs=objs) + return result.__finalize__( + types.SimpleNamespace(objs=objs), method="concat" + ) # combine as columns in a frame else: @@ -556,7 +559,7 @@ def _get_result( ) df = cons(data, index=index, copy=False) df.columns = columns - return df.__finalize__(object(), method="concat", objs=objs) + return df.__finalize__(types.SimpleNamespace(objs=objs), method="concat") # combine block managers else: @@ -595,7 +598,7 @@ def _get_result( ) out = sample._constructor_from_mgr(new_data, axes=new_data.axes) - return out.__finalize__(object(), method="concat", objs=objs) + return out.__finalize__(types.SimpleNamespace(objs=objs), method="concat") def new_axes( diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index d06bfad930d7c..1d0f491529b56 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -84,9 +84,8 @@ def finalize(self, other, method=None, **kwargs): value = getattr(left, name, "") + "|" + getattr(right, name, "") object.__setattr__(self, name, value) elif method == "concat": - objs = kwargs["objs"] value = "+".join( - [getattr(o, name) for o in objs if getattr(o, name, None)] + [getattr(o, name) for o in other.objs if getattr(o, name, None)] ) object.__setattr__(self, name, value) else: diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 9e2dae8d132eb..7dcdcd96cce51 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -94,9 +94,12 @@ def test_metadata_propagation_indiv(self, monkeypatch): def finalize(self, other, method=None, **kwargs): for name in self._metadata: if method == "concat" and name == "filename": - objs = kwargs["objs"] value = "+".join( - [getattr(obj, name) for obj in objs if getattr(obj, name, None)] + [ + getattr(obj, name) + for obj in other.objs + if getattr(obj, name, None) + ] ) 
                    object.__setattr__(self, name, value)
                else:

From 374f3862039e6ddbbadc1c37ed76d5c1282971b0 Mon Sep 17 00:00:00 2001
From: taranarmo
Date: Tue, 9 Jul 2024 01:33:03 +0200
Subject: [PATCH 167/272] BUG: make JSONTableWriter fail if no index.name and
 'index' in columns (#58985)

* BUG: make JSONTableWriter fail if no index.name and 'index' in columns

This commit is intended to fix GH #58925. If index.name is empty, it will use
set_default_names inside __init__ to make the check on overlapping names fail.
Otherwise it's done during schema creation and not reflected on the dataframe
itself, which creates an inconsistency between the data and its schema.

add mention of the raised error to the `to_json` documentation

move new logic description from IO docs to to_json docstring

* Accept the suggestion by mroeschke

Rephrase the what's new addition

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 doc/source/whatsnew/v3.0.0.rst      | 1 +
 pandas/core/generic.py              | 3 ++-
 pandas/io/json/_json.py             | 3 +++
 pandas/tests/io/json/test_pandas.py | 7 +++++++
 4 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 0acb82ffeca3e..2025474fecb0b 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -559,6 +559,7 @@ MultiIndex
 I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`)
+- Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 5d9e04bd50979..2a0495dff6681 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2387,7 +2387,8 @@ def to_json(
         index : bool or None, default None
             The index is only used when 'orient' is 'split', 'index', 'column',
             or 'table'. Of these, 'index' and 'column' do not support
-            `index=False`.
+            `index=False`. The string 'index' as a column name with empty :class:`Index`
+            or if it is 'index' will raise a ``ValueError``.
         indent : int, optional
            Length of whitespace used to indent each record.
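
A minimal reproduction of the enforced error (values are illustrative),
mirroring the test added below:

    import pandas as pd

    df = pd.DataFrame({"index": [1, 2], "a": [2, 3]})
    # The unnamed index defaults to the name "index", which collides with
    # the column of the same name:
    df.to_json(orient="table")
    # ValueError: Overlapping names between the index and columns
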
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 24fcb78a41e9d..b29ead1d14b1d 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -59,6 +59,7 @@ from pandas.io.json._table_schema import ( build_table_schema, parse_table_schema, + set_default_names, ) from pandas.io.parsers.readers import validate_integer @@ -353,6 +354,8 @@ def __init__( raise ValueError(msg) self.schema = build_table_schema(obj, index=self.index) + if self.index: + obj = set_default_names(obj) # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e00c193fd471a..a34c0adc69821 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1610,6 +1610,13 @@ def test_to_json_from_json_columns_dtypes(self, orient): ) tm.assert_frame_equal(result, expected) + def test_to_json_with_index_as_a_column_name(self): + df = DataFrame(data={"index": [1, 2], "a": [2, 3]}) + with pytest.raises( + ValueError, match="Overlapping names between the index and columns" + ): + df.to_json(orient="table") + @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 From 9eaa4bc7f98a31a472d2dc46ae6dc4d4482d8aff Mon Sep 17 00:00:00 2001 From: Ritwiz Sinha <43509699+ritwizsinha@users.noreply.github.com> Date: Tue, 9 Jul 2024 22:08:24 +0530 Subject: [PATCH 168/272] Add support for NumpyExtensionArray in pd.unique() (#59214) * Add support for NumpyExtensionArray in unique * Add space to end of string --- pandas/core/algorithms.py | 19 +++++++++++++++---- pandas/tests/test_algos.py | 19 +++++++++++++++++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 92bd55cac9c5e..948836bf6a51d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -223,13 +223,17 @@ def _ensure_arraylike(values, func_name: str) -> ArrayLike: """ ensure that we are arraylike if not already """ - if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): + if not isinstance( + values, + (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray, ABCNumpyExtensionArray), + ): # GH#52986 if func_name != "isin-targets": # Make an exception for the comps argument in isin. raise TypeError( f"{func_name} requires a Series, Index, " - f"ExtensionArray, or np.ndarray, got {type(values).__name__}." + f"ExtensionArray, np.ndarray or NumpyExtensionArray " + f"got {type(values).__name__}." ) inferred = lib.infer_dtype(values, skipna=False) @@ -325,7 +329,7 @@ def unique(values): Returns ------- - numpy.ndarray or ExtensionArray + numpy.ndarray, ExtensionArray or NumpyExtensionArray The return can be: @@ -333,7 +337,7 @@ def unique(values): * Categorical : when the input is a Categorical dtype * ndarray : when the input is a Series/ndarray - Return numpy.ndarray or ExtensionArray. + Return numpy.ndarray, ExtensionArray or NumpyExtensionArray. 
     See Also
     --------
@@ -405,6 +409,13 @@ def unique(values):
     >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
     array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
+
+    An NumpyExtensionArray of complex
+
+    >>> pd.unique(pd.array([1 + 1j, 2, 3]))
+    <NumpyExtensionArray>
+    [(1+1j), (2+0j), (3+0j)]
+    Length: 3, dtype: complex128
     """
     return unique_with_mask(values)

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 134ebededd163..cdcd36846c560 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -873,6 +873,14 @@ def test_unique_masked(self, any_numeric_ea_dtype):
         expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype)
         tm.assert_extension_array_equal(result, expected)

+    def test_unique_NumpyExtensionArray(self):
+        arr_complex = pd.array(
+            [1 + 1j, 2, 3]
+        )  # NumpyEADtype('complex128') => NumpyExtensionArray
+        result = pd.unique(arr_complex)
+        expected = pd.array([1 + 1j, 2 + 0j, 3 + 0j])
+        tm.assert_extension_array_equal(result, expected)
+

 def test_nunique_ints(index_or_series_or_array):
     # GH#36327
@@ -1638,7 +1646,10 @@ def test_unique_tuples(self, arr, uniques):
     expected = np.empty(len(uniques), dtype=object)
     expected[:] = uniques

-    msg = "unique requires a Series, Index, ExtensionArray, or np.ndarray, got list"
+    msg = (
+        r"unique requires a Series, Index, ExtensionArray, np.ndarray "
+        r"or NumpyExtensionArray got list"
+    )
     with pytest.raises(TypeError, match=msg):
         # GH#52986
         pd.unique(arr)
@@ -1657,7 +1668,11 @@ def test_unique_tuples(self, arr, uniques):
 )
 def test_unique_complex_numbers(self, array, expected):
     # GH 17927
-    msg = "unique requires a Series, Index, ExtensionArray, or np.ndarray, got list"
+    msg = (
+        r"unique requires a Series, Index, ExtensionArray, np.ndarray "
+        r"or NumpyExtensionArray got list"
+    )
+
     with pytest.raises(TypeError, match=msg):
         # GH#52986
         pd.unique(array)

From 3a34e078aa2eeeda86fcd02fb2f132ce594fbb94 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 9 Jul 2024 06:39:45 -1000
Subject: [PATCH 169/272] REF: Move methods in core/reshape/util.py to where
 they are used (#59172)

* Move methods in core/reshape/util.py to where they are used
* Remove unit tests
---
 pandas/core/indexes/multi.py                | 58 ++++++++++++-
 pandas/core/reshape/melt.py                 |  4 +-
 pandas/core/reshape/pivot.py                |  9 +-
 pandas/core/reshape/util.py                 | 85 -------------------
 .../{reshape => indexes/multi}/test_util.py | 18 +---
 5 files changed, 62 insertions(+), 112 deletions(-)
 delete mode 100644 pandas/core/reshape/util.py
 rename pandas/tests/{reshape => indexes/multi}/test_util.py (78%)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 8e3ebc7816fed..19c94fa4104d7 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -638,7 +638,6 @@ def from_product(
                     (2, 'purple')],
                    names=['number', 'color'])
         """
-        from pandas.core.reshape.util import cartesian_product

         if not is_list_like(iterables):
             raise TypeError("Input must be a list / sequence of iterables.")
@@ -4105,3 +4104,60 @@ def _require_listlike(level, arr, arrname: str):
     if not is_list_like(arr) or not is_list_like(arr[0]):
         raise TypeError(f"{arrname} must be list of lists-like")
     return level, arr
+
+
+def cartesian_product(X: list[np.ndarray]) -> list[np.ndarray]:
+    """
+    Numpy version of itertools.product.
+    Sometimes faster (for large inputs)...
+ + Parameters + ---------- + X : list-like of list-likes + + Returns + ------- + product : list of ndarrays + + Examples + -------- + >>> cartesian_product([list("ABC"), [1, 2]]) + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype=' list[np.ndarray]: - """ - Numpy version of itertools.product. - Sometimes faster (for large inputs)... - - Parameters - ---------- - X : list-like of list-likes - - Returns - ------- - product : list of ndarrays - - Examples - -------- - >>> cartesian_product([list("ABC"), [1, 2]]) - [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype=' NumpyIndexT: - """ - Index compat for np.tile. - - Notes - ----- - Does not support multi-dimensional `num`. - """ - if isinstance(arr, np.ndarray): - return np.tile(arr, num) - - # Otherwise we have an Index - taker = np.tile(np.arange(len(arr)), num) - return arr.take(taker) diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/indexes/multi/test_util.py similarity index 78% rename from pandas/tests/reshape/test_util.py rename to pandas/tests/indexes/multi/test_util.py index d2971db3d7aa2..68792ce53f04e 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/indexes/multi/test_util.py @@ -6,7 +6,7 @@ date_range, ) import pandas._testing as tm -from pandas.core.reshape.util import cartesian_product +from pandas.core.indexes.multi import cartesian_product class TestCartesianProduct: @@ -28,22 +28,6 @@ def test_datetimeindex(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) - def test_tzaware_retained(self): - x = date_range("2000-01-01", periods=2, tz="US/Pacific") - y = np.array([3, 4]) - result1, result2 = cartesian_product([x, y]) - - expected = x.repeat(2) - tm.assert_index_equal(result1, expected) - - def test_tzaware_retained_categorical(self): - x = date_range("2000-01-01", periods=2, tz="US/Pacific").astype("category") - y = np.array([3, 4]) - result1, result2 = cartesian_product([x, y]) - - expected = x.repeat(2) - tm.assert_index_equal(result1, expected) - @pytest.mark.parametrize("x, y", [[[], []], [[0, 1], []], [[], ["a", "b", "c"]]]) def test_empty(self, x, y): # product of empty factors From d96646219618e007f64ee49e0a6e20f4aea761b5 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Tue, 9 Jul 2024 19:19:30 +0200 Subject: [PATCH 170/272] CLN: enforced the deprecation of strings 'H', 'BH', 'CBH' in favor of 'h', 'bh', 'cbh' (#59143) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CLN: enforced the deprecation of strings ‘H’, ‘BH’, ‘CBH’ in favour of ‘h’, ‘bh’, ‘cbh’ * fix tests * add a note to v3.0.0 * fixup * add def INVALID_FREQ_ERR_MSG to dtypes.pxd * Revert "add def INVALID_FREQ_ERR_MSG to dtypes.pxd" This reverts commit 4085d5c3215c4cd6d022df702f2734a0106c40d1. 
* remove dict c_REMOVED_ABBREVS, add msg if raise KeyError in get_reso_from_freqstr --- doc/source/whatsnew/v3.0.0.rst | 2 + pandas/_libs/tslibs/dtypes.pxd | 2 +- pandas/_libs/tslibs/dtypes.pyx | 42 +++--------------- pandas/_libs/tslibs/offsets.pyx | 11 ----- pandas/tests/arrays/test_datetimes.py | 9 +++- .../tests/indexes/datetimes/test_datetime.py | 27 +++--------- .../tests/indexes/period/test_period_range.py | 8 ++-- .../timedeltas/test_timedelta_range.py | 44 +++++-------------- pandas/tests/scalar/period/test_asfreq.py | 5 +-- pandas/tests/tslibs/test_resolution.py | 10 ++--- pandas/tests/tslibs/test_to_offset.py | 20 ++++----- 11 files changed, 50 insertions(+), 130 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2025474fecb0b..cd917924880f1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -383,6 +383,8 @@ Other Removals - Enforced deprecation of string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57699`) - Enforced deprecation of string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`57793`) - Enforced deprecation of string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57793`) +- Enforced deprecation of strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`59143`) +- Enforced deprecation of strings ``H``, ``BH``, and ``CBH`` denoting units in :class:`Timedelta` (:issue:`59143`) - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`) - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. 
Now this raises a ``ValueError`` (:issue:`43485`) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 204d582294a5b..d8c536a34bc04 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -14,12 +14,12 @@ cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cdef dict c_OFFSET_TO_PERIOD_FREQSTR cdef dict c_PERIOD_TO_OFFSET_FREQSTR cdef dict c_OFFSET_RENAMED_FREQSTR -cdef dict c_DEPR_ABBREVS cdef dict c_DEPR_UNITS cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname cdef dict attrname_to_npy_unit +cdef str INVALID_FREQ_ERR_MSG cdef enum c_FreqGroup: # Mirrors FreqGroup in the .pyx file diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 40d2395b38f04..7e6e382c17cc6 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -1,9 +1,6 @@ # period frequency constants corresponding to scikits timeseries # originals from enum import Enum -import warnings - -from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS from pandas._libs.tslibs.np_datetime cimport ( @@ -338,14 +335,6 @@ PERIOD_TO_OFFSET_FREQSTR = { cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR cdef dict c_PERIOD_TO_OFFSET_FREQSTR = PERIOD_TO_OFFSET_FREQSTR -# Map deprecated resolution abbreviations to correct resolution abbreviations -cdef dict c_DEPR_ABBREVS = { - "H": "h", - "BH": "bh", - "CBH": "cbh", - "S": "s", -} - cdef dict c_DEPR_UNITS = { "w": "W", "d": "D", @@ -372,6 +361,8 @@ cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { "MIN": "min", } +cdef str INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" + class FreqGroup(Enum): # Mirrors c_FreqGroup in the .pxd file @@ -461,39 +452,18 @@ class Resolution(Enum): >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR True """ - cdef: - str abbrev try: - if freq in c_DEPR_ABBREVS: - abbrev = c_DEPR_ABBREVS[freq] - warnings.warn( - f"\'{freq}\' is deprecated and will be removed in a future " - f"version. Please use \'{abbrev}\' " - f"instead of \'{freq}\'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - freq = abbrev attr_name = _abbrev_to_attrnames[freq] - except KeyError: + except KeyError as exc: + msg = INVALID_FREQ_ERR_MSG.format(freq) # For quarterly and yearly resolutions, we need to chop off # a month string. split_freq = freq.split("-") if len(split_freq) != 2: - raise + raise ValueError(msg) from exc if split_freq[1] not in _month_names: # i.e. we want e.g. "Q-DEC", not "Q-INVALID" - raise - if split_freq[0] in c_DEPR_ABBREVS: - abbrev = c_DEPR_ABBREVS[split_freq[0]] - warnings.warn( - f"\'{split_freq[0]}\' is deprecated and will be removed in a " - f"future version. 
Please use \'{abbrev}\' " - f"instead of \'{split_freq[0]}\'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - split_freq[0] = abbrev + raise ValueError(msg) from exc attr_name = _abbrev_to_attrnames[split_freq[0]] return cls.from_attrname(attr_name) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 5ae2de907af18..0afeb002a8151 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -56,7 +56,6 @@ from pandas._libs.tslibs.ccalendar cimport ( ) from pandas._libs.tslibs.conversion cimport localize_pydatetime from pandas._libs.tslibs.dtypes cimport ( - c_DEPR_ABBREVS, c_OFFSET_RENAMED_FREQSTR, c_OFFSET_TO_PERIOD_FREQSTR, c_PERIOD_AND_OFFSET_DEPR_FREQSTR, @@ -4908,16 +4907,6 @@ cpdef to_offset(freq, bint is_period=False): if not stride: stride = 1 - if prefix in c_DEPR_ABBREVS: - warnings.warn( - f"\'{prefix}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_DEPR_ABBREVS.get(prefix)}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - prefix = c_DEPR_ABBREVS[prefix] - if prefix in {"D", "h", "min", "s", "ms", "us", "ns"}: # For these prefixes, we have something like "3h" or # "2.5min", so we can construct a Timedelta with the diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 0a00264a7156f..de189b7e2f724 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -760,7 +760,7 @@ def test_date_range_frequency_M_Q_Y_raises(self, freq): with pytest.raises(ValueError, match=msg): pd.date_range("1/1/2000", periods=4, freq=freq) - @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"]) + @pytest.mark.parametrize("freq_depr", ["2MIN", "2mS", "2Us"]) def test_date_range_uppercase_frequency_deprecated(self, freq_depr): # GH#9586, GH#54939 depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " @@ -807,6 +807,13 @@ def test_date_range_frequency_A_raises(self, freq): with pytest.raises(ValueError, match=msg): pd.date_range("1/1/2000", periods=4, freq=freq) + @pytest.mark.parametrize("freq", ["2H", "2CBH", "2S"]) + def test_date_range_uppercase_frequency_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) + def test_factorize_sort_without_freq(): dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]") diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index cc2b802de2a16..04334a1d8d0c8 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -133,29 +133,12 @@ def test_asarray_tz_aware(self): tm.assert_numpy_array_equal(result, expected) - def test_CBH_deprecated(self): - msg = "'CBH' is deprecated and will be removed in a future version." 
- - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range( - dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq="CBH" - ) - result = DatetimeIndex( - [ - "2022-12-12 09:00:00", - "2022-12-12 10:00:00", - "2022-12-12 11:00:00", - "2022-12-12 12:00:00", - "2022-12-12 13:00:00", - "2022-12-12 14:00:00", - "2022-12-12 15:00:00", - "2022-12-12 16:00:00", - ], - dtype="datetime64[ns]", - freq="cbh", - ) + @pytest.mark.parametrize("freq", ["2H", "2BH", "2S"]) + def test_CBH_raises(self, freq): + msg = f"Invalid frequency: {freq}" - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range(dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq=freq) @pytest.mark.parametrize("freq", ["2BM", "1bm", "2BQ", "1BQ-MAR", "2BY-JUN", "1by"]) def test_BM_BQ_BY_raises(self, freq): diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 4e58dc1f324b2..51b03024ce272 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -203,7 +203,7 @@ def test_constructor_U(self): with pytest.raises(ValueError, match="Invalid frequency: X"): period_range("2007-1-1", periods=500, freq="X") - @pytest.mark.parametrize("freq_depr", ["2H", "2MIN", "2S", "2US", "2NS"]) + @pytest.mark.parametrize("freq_depr", ["2MIN", "2US", "2NS"]) def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): # GH#52536, GH#54939 msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " @@ -212,9 +212,9 @@ def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): with tm.assert_produces_warning(FutureWarning, match=msg): period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) - @pytest.mark.parametrize("freq", ["2m", "2q-sep", "2y"]) - def test_lowercase_freq_from_time_series_raises(self, freq): - # GH#52536, GH#54939 + @pytest.mark.parametrize("freq", ["2m", "2q-sep", "2y", "2H", "2S"]) + def test_incorrect_case_freq_from_time_series_raises(self, freq): + # GH#52536, GH#54939, GH#59143 msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 1b645e2bc607f..6f3d29fb4240a 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,7 +3,6 @@ from pandas import ( Timedelta, - TimedeltaIndex, timedelta_range, to_timedelta, ) @@ -70,14 +69,12 @@ def test_linspace_behavior(self, periods, freq): expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) - def test_timedelta_range_H_deprecated(self): + def test_timedelta_range_H_raises(self): # GH#52536 - msg = "'H' is deprecated and will be removed in a future version." 
+ msg = "Invalid frequency: H" - result = timedelta_range(start="0 days", end="4 days", periods=6) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = timedelta_range(start="0 days", end="4 days", freq="19H12min") - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days", end="4 days", freq="19H12min") def test_timedelta_range_T_raises(self): msg = "Invalid frequency: T" @@ -130,33 +127,6 @@ def test_timedelta_range_infer_freq(self): result = timedelta_range("0s", "1s", periods=31) assert result.freq is None - @pytest.mark.parametrize( - "freq_depr, start, end, expected_values, expected_freq", - [ - ( - "3.5S", - "05:03:01", - "05:03:10", - ["0 days 05:03:01", "0 days 05:03:04.500000", "0 days 05:03:08"], - "3500ms", - ), - ], - ) - def test_timedelta_range_deprecated_freq( - self, freq_depr, start, end, expected_values, expected_freq - ): - # GH#52536 - msg = ( - f"'{freq_depr[-1]}' is deprecated and will be removed in a future version." - ) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = timedelta_range(start=start, end=end, freq=freq_depr) - expected = TimedeltaIndex( - expected_values, dtype="timedelta64[ns]", freq=expected_freq - ) - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( "freq_depr, start, end", [ @@ -170,9 +140,15 @@ def test_timedelta_range_deprecated_freq( "5 hours", "5 hours 8 minutes", ), + ( + "3.5S", + "05:03:01", + "05:03:10", + ), ], ) def test_timedelta_range_removed_freq(self, freq_depr, start, end): + # GH#59143 msg = f"Invalid frequency: {freq_depr}" with pytest.raises(ValueError, match=msg): timedelta_range(start=start, end=end, freq=freq_depr) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 90d4a7d0cc23b..0ae5389a3e9b5 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -111,8 +111,7 @@ def test_conv_annual(self): assert ival_A.asfreq("B", "E") == ival_A_to_B_end assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - msg_depr = "'H' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg_depr): + with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("H", "s") == ival_A_to_H_start assert ival_A.asfreq("H", "E") == ival_A_to_H_end assert ival_A.asfreq("min", "s") == ival_A_to_T_start @@ -120,8 +119,6 @@ def test_conv_annual(self): with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("T", "s") == ival_A_to_T_start assert ival_A.asfreq("T", "E") == ival_A_to_T_end - msg_depr = "'S' is deprecated and will be removed in a future version." 
-
-        with tm.assert_produces_warning(FutureWarning, match=msg_depr):
         assert ival_A.asfreq("S", "S") == ival_A_to_S_start
         assert ival_A.asfreq("S", "E") == ival_A_to_S_end
 
diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py
index 722359380f6a3..0e7705ad7ed94 100644
--- a/pandas/tests/tslibs/test_resolution.py
+++ b/pandas/tests/tslibs/test_resolution.py
@@ -9,8 +9,6 @@
 )
 from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
 
-import pandas._testing as tm
-
 
 def test_get_resolution_nano():
     # don't return the fallback RESO_DAY
@@ -50,9 +48,9 @@ def test_get_attrname_from_abbrev(freqstr, expected):
 
 
 @pytest.mark.parametrize("freq", ["H", "S"])
-def test_units_H_S_deprecated_from_attrname_to_abbrevs(freq):
-    # GH#52536
-    msg = f"'{freq}' is deprecated and will be removed in a future version."
+def test_unit_H_S_raises(freq):
+    # GH#59143
+    msg = f"Invalid frequency: {freq}"
 
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with pytest.raises(ValueError, match=msg):
         Resolution.get_reso_from_freqstr(freq)
diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py
index c123c00e749db..9e32a33650591 100644
--- a/pandas/tests/tslibs/test_to_offset.py
+++ b/pandas/tests/tslibs/test_to_offset.py
@@ -203,17 +203,7 @@ def test_to_offset_lowercase_frequency_raises(freq_depr):
         to_offset(freq_depr)
 
 
-@pytest.mark.parametrize(
-    "freq_depr",
-    [
-        "2H",
-        "2BH",
-        "2MIN",
-        "2S",
-        "2Us",
-        "2NS",
-    ],
-)
+@pytest.mark.parametrize("freq_depr", ["2MIN", "2Us", "2NS"])
 def test_to_offset_uppercase_frequency_deprecated(freq_depr):
     # GH#54939
     depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a "
@@ -238,3 +228,11 @@ def test_to_offset_lowercase_frequency_deprecated(freq_depr, expected):
     with tm.assert_produces_warning(FutureWarning, match=msg):
         result = to_offset(freq_depr)
         assert result == expected
+
+
+@pytest.mark.parametrize("freq", ["2H", "2BH", "2S"])
+def test_to_offset_uppercase_frequency_raises(freq):
+    msg = f"Invalid frequency: {freq}"
+
+    with pytest.raises(ValueError, match=msg):
+        to_offset(freq)

From 9e2bab16df25d649395d44ceaa611a503595c3b7 Mon Sep 17 00:00:00 2001
From: William Andrea <22385371+wjandrea@users.noreply.github.com>
Date: Thu, 11 Jul 2024 12:09:44 -0400
Subject: [PATCH 171/272] DOC: inline link (#59230)

avoid showing bare URL
---
 pandas/core/frame.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 5ef663564a016..9e0844f255eb2 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4554,8 +4554,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
         For example, ```it's` > `that's``` will raise an error,
         as it forms a quoted string (``'s > `that'``) with a backtick inside.
 
-        See also the Python documentation about lexical analysis
-        (https://docs.python.org/3/reference/lexical_analysis.html)
+        See also the `Python documentation about lexical analysis
+        <https://docs.python.org/3/reference/lexical_analysis.html>`__
         in combination with the source code in :mod:`pandas.core.computation.parsing`.
 
         Examples

From ac13a093e26ab217743e2fb028274dc50f8334a9 Mon Sep 17 00:00:00 2001
From: Annika <163510717+annika-rudolph@users.noreply.github.com>
Date: Thu, 11 Jul 2024 18:30:41 +0200
Subject: [PATCH 172/272] BUG: Add frequency to DatetimeArray/TimedeltaArray
 take (#58382)

* add take function including frequency for Timedelta and Datetime Arrays

* add test for frequency of DatetimeIndex in MultiIndex

* use super() in take function

* add description to whatsnew and revert unwanted changes in datetimearray
  docstring

make pre-commit happy

* switch .freq to ._freq

---------

Co-authored-by: [Annika Rudolph] <[annika.rudolph@analytical-software.de]>
---
 doc/source/whatsnew/v3.0.0.rst                 |  1 +
 pandas/core/arrays/datetimelike.py             | 22 +++++++++++++++++++
 .../indexes/multi/test_get_level_values.py     |  9 ++++++++
 3 files changed, 32 insertions(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index cd917924880f1..639655ab28199 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -556,6 +556,7 @@ MultiIndex
 - :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`)
 - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
 - :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`)
+- :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
 -
 
 I/O
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index c90ff410b4b93..ad0bde3abbdd4 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -65,6 +65,7 @@
     ScalarIndexer,
     Self,
     SequenceIndexer,
+    TakeIndexer,
     TimeAmbiguous,
     TimeNonexistent,
     npt,
@@ -2340,6 +2341,27 @@ def interpolate(
             return self
         return type(self)._simple_new(out_data, dtype=self.dtype)
 
+    def take(
+        self,
+        indices: TakeIndexer,
+        *,
+        allow_fill: bool = False,
+        fill_value: Any = None,
+        axis: AxisInt = 0,
+    ) -> Self:
+        result = super().take(
+            indices=indices, allow_fill=allow_fill, fill_value=fill_value, axis=axis
+        )
+
+        indices = np.asarray(indices, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
+
+        if isinstance(maybe_slice, slice):
+            freq = self._get_getitem_freq(maybe_slice)
+            result._freq = freq
+
+        return result
+
     # --------------------------------------------------------------
     # Unsorted
 
diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py
index 28c77e78924cb..4db74a716c514 100644
--- a/pandas/tests/indexes/multi/test_get_level_values.py
+++ b/pandas/tests/indexes/multi/test_get_level_values.py
@@ -122,3 +122,12 @@ def test_values_loses_freq_of_underlying_index():
         midx.values
     assert idx.freq is not None
     tm.assert_index_equal(idx, expected)
+
+
+def test_get_level_values_gets_frequency_correctly():
+    # GH#57949 GH#58327
+    datetime_index = date_range(start=pd.to_datetime("1/1/2018"), periods=4, freq="YS")
+    other_index = ["A"]
+    multi_index = MultiIndex.from_product([datetime_index, other_index])
+
+    assert multi_index.get_level_values(0).freq == datetime_index.freq

From 61e209e4e9b628e997a648e12e24ac47fa3e1e26 Mon Sep 17 00:00:00 2001
From: Borja Elizalde
Date: Thu, 11 Jul 2024 20:07:53 +0200
Subject: [PATCH 173/272] modified the automatically generated docstring to
 include the return … (#59229)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* modified the automatically generated docstring to include the return type
  specification

* removed Series.skew from the list as it is no longer giving an error

---------

Co-authored-by: Borja Elizalde
---
 ci/code_checks.sh      | 6 +-----
 pandas/core/generic.py | 2 ++
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index b01866a6d6c82..c4d91da70adb5 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -70,10 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         --format=actions \
         -i ES01 `# For now it is ok if docstrings are missing the extended summary` \
         -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
-        -i "pandas.DataFrame.max RT03" \
-        -i "pandas.DataFrame.mean RT03" \
-        -i "pandas.DataFrame.median RT03" \
-        -i "pandas.DataFrame.min RT03" \
         -i "pandas.DataFrame.plot PR02" \
         -i "pandas.Grouper PR02" \
         -i "pandas.MultiIndex.append PR07,SA01" \
@@ -166,7 +162,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.Series.product RT03" \
     -i "pandas.Series.reorder_levels RT03,SA01" \
     -i "pandas.Series.sem PR01,RT03,SA01" \
-    -i "pandas.Series.skew RT03,SA01" \
+    -i "pandas.Series.skew SA01" \
    -i "pandas.Series.sparse PR01,SA01" \
     -i "pandas.Series.sparse.density SA01" \
     -i "pandas.Series.sparse.fill_value SA01" \
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 2a0495dff6681..fc9821a65777d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -11777,6 +11777,8 @@ def last_valid_index(self) -> Hashable:
         Returns
         -------
         {name1} or scalar\
+
+            Value containing the calculation referenced in the description.\
 {see_also}\
 {examples}
         """

From 1b6d717cdd716f5d62e8c22337801a83f9e1327d Mon Sep 17 00:00:00 2001
From: Ritwiz Sinha <43509699+ritwizsinha@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:13:10 +0530
Subject: [PATCH 174/272] DOC: "list" is not a keyword - .query (#59236)

Fix documentation
---
 pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9e0844f255eb2..ee48f546815bb 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4473,7 +4473,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
         or punctuations (besides underscores) or starting with digits must be
         surrounded by backticks. (For example, a column named "Area (cm^2)" would
         be referenced as ```Area (cm^2)```). Column names which are Python keywords
-        (like "list", "for", "import", etc) cannot be used.
+        (like "if", "for", "import", etc) cannot be used.
 
         For example, if one of your columns is called ``a a`` and you want
         to sum it with ``b``, your query should be ```a a` + b``.

From 1165859bd147a0b4afa66663ad66078beb91f4fe Mon Sep 17 00:00:00 2001
From: Paul Bissex
Date: Fri, 12 Jul 2024 10:52:23 -0400
Subject: [PATCH 175/272] DOC: Edited note on index_col/parse_dates params for
 clarity (#59223)

* Edited note on index_col/parse_dates params for clarity

(The sentence as it stands is missing a verb; maybe the result of an editing
mishap?)
* Update doc/source/getting_started/intro_tutorials/04_plotting.rst --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/getting_started/intro_tutorials/04_plotting.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst index 49cf7d32e0ef5..e9f83c602d086 100644 --- a/doc/source/getting_started/intro_tutorials/04_plotting.rst +++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst @@ -32,8 +32,10 @@ How do I create plots in pandas? air_quality.head() .. note:: - The usage of the ``index_col`` and ``parse_dates`` parameters of the ``read_csv`` function to define the first (0th) column as - index of the resulting ``DataFrame`` and convert the dates in the column to :class:`Timestamp` objects, respectively. + The ``index_col=0`` and ``parse_dates=True`` parameters passed to the ``read_csv`` function define + the first (0th) column as index of the resulting ``DataFrame`` and convert the dates in the column + to :class:`Timestamp` objects, respectively. + .. raw:: html From 2a9855b55b9912d336592569136d580ae0aa8209 Mon Sep 17 00:00:00 2001 From: Agriya Khetarpal <74401230+agriyakhetarpal@users.noreply.github.com> Date: Sat, 13 Jul 2024 01:44:09 +0530 Subject: [PATCH 176/272] BLD, CI: Use `cibuildwheel` to build Emscripten/Pyodide wheels, push nightlies to Anaconda.org (#58647) * BLD: Add note about keeping jobs in sync * BLD, CI: Upload Emscripten wheels nightly to Anaconda * Add configuration for `cibuildwheel`-Pyodide * Use unreleased `cibuildwheel` in wheels CI job * Temporarily move config from TOML to env vars * Rename job, to match update comment * Try out Pyodide 0.26.1 * Move Pyodide configuration to `pyproject.toml` * Use cibuildwheel v2.19 + clean up workflow * Skip a test that uses subprocesses * Match tests args with other Pyodide tests; use `not single_cpu` * Bump to cibuildwheel version 2.19.1 * Don't add `cp312` in job name Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Don't use separate job for Pyodide wheels * Fix matrix inclusion * Add separate step, set `CIBW_PLATFORM` to Pyodide * Add condition for non-Pyodide jobs * Use just one step, inherit `CIBW_PLATFORM` if not set Co-Authored-By: Thomas Li <47963215+lithomas1@users.noreply.github.com> * Remove condition that skips the step Co-Authored-By: Thomas Li <47963215+lithomas1@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 1 + .github/workflows/wheels.yml | 8 ++++++++ pandas/tests/test_common.py | 3 +++ pyproject.toml | 10 ++++++++++ 4 files changed, 22 insertions(+) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 982877ee7f365..ddb6ecbe83126 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -391,6 +391,7 @@ jobs: env: PYTHON_GIL: 0 + # NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml emscripten: # Note: the Python version, Emscripten toolchain version are determined # by the Pyodide version. 
The appropriate versions can be found in the diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index f61ef550f74df..02100648b636a 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -100,6 +100,13 @@ jobs: - [windows-2022, win_amd64] # TODO: support PyPy? python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] + + # Build Pyodide wheels and upload them to Anaconda.org + # NOTE: this job is similar to the one in unit-tests.yml except for the fact + # that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup. + include: + - buildplat: [ubuntu-22.04, pyodide_wasm32] + python: ["cp312", "3.12"] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -146,6 +153,7 @@ jobs: env: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} - name: Set up Python uses: mamba-org/setup-micromamba@v1 diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 7b93416600f8f..ca97af0d3eb32 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.compat import WASM + import pandas as pd from pandas import Series import pandas._testing as tm @@ -233,6 +235,7 @@ def test_temp_setattr(with_exception): assert ser.name == "first" +@pytest.mark.skipif(WASM, reason="Can't start subprocesses in WASM") @pytest.mark.single_cpu def test_str_size(): # GH#21758 diff --git a/pyproject.toml b/pyproject.toml index 9156c73efbb35..47fd540d67ab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -177,6 +177,16 @@ test-command = "" select = "*-macosx*" environment = {CFLAGS="-g0"} +[[tool.cibuildwheel.overrides]] +select = "*pyodide*" +test-requires = "pytest>=7.3.2 hypothesis>=6.46.1" +# Pyodide repairs wheels on its own, using auditwheel-emscripten +repair-wheel-command = "" +test-command = """ + PANDAS_CI='1' python -c 'import pandas as pd; \ + pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ + """ + [tool.ruff] line-length = 88 target-version = "py310" From 39bd3d38ac97177c22e68a9259bf4f09f7315277 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 13 Jul 2024 12:24:51 -0400 Subject: [PATCH 177/272] BUG/TST: non-numeric EA reductions (#59234) * BUG/TST: non-numeric EA reductions * whatsnew * add keepdims keyword to StringArray._reduce --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 2 -- pandas/core/arrays/base.py | 5 ++- pandas/core/arrays/datetimes.py | 13 +++++++ pandas/core/arrays/period.py | 11 ++++++ pandas/core/arrays/string_.py | 13 +++++-- pandas/tests/extension/base/reduce.py | 3 -- pandas/tests/extension/test_arrow.py | 47 +++++++++++++++++++------ pandas/tests/extension/test_datetime.py | 5 +++ 9 files changed, 81 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 639655ab28199..ef06f57f611d1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -616,6 +616,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) - Bug in 
:meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) +- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`) Styler ^^^^^^ diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 943656ba48432..5da479760047f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1706,8 +1706,6 @@ def pyarrow_meth(data, skip_nulls, **kwargs): if name == "median": # GH 52679: Use quantile instead of approximate_median; returns array result = result[0] - if pc.is_null(result).as_py(): - return result if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): result = result.cast(pa_type) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1e8fec7fde3de..b429b7c1b1fc4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1986,7 +1986,10 @@ def _reduce( ) result = meth(skipna=skipna, **kwargs) if keepdims: - result = np.array([result]) + if name in ["min", "max"]: + result = self._from_sequence([result], dtype=self.dtype) + else: + result = np.array([result]) return result diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 34d25f04b69e1..dddfc440109d3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2275,6 +2275,19 @@ def to_julian_date(self) -> npt.NDArray[np.float64]: # ----------------------------------------------------------------- # Reductions + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + if keepdims and isinstance(result, np.ndarray): + if name == "std": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(result) + else: + return self._from_sequence(result, dtype=self.dtype) + return result + def std( self, axis=None, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index e762c3e547819..b3513dd083e41 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -956,6 +956,17 @@ def _check_timedeltalike_freq_compat(self, other): delta = delta.view("i8") return lib.item_from_zerodim(delta) + # ------------------------------------------------------------------ + # Reductions + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + if keepdims and isinstance(result, np.ndarray): + return self._from_sequence(result, dtype=self.dtype) + return result + def raise_on_incompatible(left, right) -> IncompatibleFrequency: """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 291cc2e62be62..13c26f0c97934 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -522,10 +522,19 @@ def astype(self, dtype, copy: bool = True): return super().astype(dtype, copy) def _reduce( - self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + self, + name: str, + *, + skipna: bool = True, + keepdims: bool = False, + axis: AxisInt | None = 0, + **kwargs, ): if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna, axis=axis) + result = getattr(self, name)(skipna=skipna, axis=axis) + if keepdims: + return self._from_sequence([result], dtype=self.dtype) + return 
result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index c3a6daee2dd54..3e357f99cfb03 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -4,7 +4,6 @@ import pandas as pd import pandas._testing as tm -from pandas.api.types import is_numeric_dtype class BaseReduceTests: @@ -119,8 +118,6 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): def test_reduce_frame(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) - if not is_numeric_dtype(ser.dtype): - pytest.skip(f"{ser.dtype} is not numeric dtype") if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an array method") diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4fad5e45409b9..6d14f04383a65 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -67,7 +67,10 @@ pa = pytest.importorskip("pyarrow") -from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.arrow.array import ( + ArrowExtensionArray, + get_unit_from_pa_dtype, +) from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -505,6 +508,16 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: # behavior which does not support this. return False + if pa.types.is_boolean(pa_dtype) and op_name in [ + "median", + "std", + "var", + "skew", + "kurt", + "sem", + ]: + return False + return True def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): @@ -540,18 +553,9 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque f"pyarrow={pa.__version__} for {pa_dtype}" ), ) - if all_numeric_reductions in {"skew", "kurt"} and ( - dtype._is_numeric or dtype.kind == "b" - ): + if all_numeric_reductions in {"skew", "kurt"} and dtype._is_numeric: request.applymarker(xfail_mark) - elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { - "sem", - "std", - "var", - "median", - }: - request.applymarker(xfail_mark) super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("skipna", [True, False]) @@ -574,8 +578,23 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + pa_type = arr._pa_array.type + if op_name in ["max", "min"]: cmp_dtype = arr.dtype + elif pa.types.is_temporal(pa_type): + if op_name in ["std", "sem"]: + if pa.types.is_duration(pa_type): + cmp_dtype = arr.dtype + elif pa.types.is_date(pa_type): + cmp_dtype = ArrowDtype(pa.duration("s")) + elif pa.types.is_time(pa_type): + unit = get_unit_from_pa_dtype(pa_type) + cmp_dtype = ArrowDtype(pa.duration(unit)) + else: + cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) + else: + cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": if op_name not in ["median", "var", "std"]: cmp_dtype = arr.dtype @@ -583,6 +602,8 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): cmp_dtype = "float64[pyarrow]" elif op_name in ["median", "var", "std", "mean", "skew"]: cmp_dtype = "float64[pyarrow]" + elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type): + cmp_dtype = "uint64[pyarrow]" else: cmp_dtype = { "i": "int64[pyarrow]", @@ -598,6 +619,10 @@ def test_reduce_frame(self, data, 
all_numeric_reductions, skipna, request): if data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") request.applymarker(mark) + elif op_name == "std" and pa.types.is_date64(data._pa_array.type) and skipna: + # overflow + mark = pytest.mark.xfail(reason="Cannot cast") + request.applymarker(mark) return super().test_reduce_frame(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"]) diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index a42fa6088d9c8..356d5352f41f4 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -95,6 +95,11 @@ def _get_expected_exception(self, op_name, obj, other): return None return super()._get_expected_exception(op_name, obj, other) + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + if op_name == "std": + return "timedelta64[ns]" + return arr.dtype + def _supports_accumulation(self, ser, op_name: str) -> bool: return op_name in ["cummin", "cummax"] From 3f82ed3928aa405b3b4c5a4c836152e86d763e0e Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 15 Jul 2024 13:28:42 -0400 Subject: [PATCH 178/272] TST: unskip EA reduction tests (#59247) unskip more EA reduction tests --- pandas/core/arrays/masked.py | 2 +- pandas/tests/extension/base/reduce.py | 4 +- .../tests/extension/decimal/test_decimal.py | 2 + pandas/tests/extension/test_arrow.py | 40 +++++-------------- pandas/tests/extension/test_masked.py | 6 +-- 5 files changed, 18 insertions(+), 36 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 93471788e72ab..92ed690e527c7 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1198,7 +1198,7 @@ def _wrap_na_result(self, *, name, axis, mask_size): mask = np.ones(mask_size, dtype=bool) float_dtyp = "float32" if self.dtype == "Float32" else "float64" - if name in ["mean", "median", "var", "std", "skew", "kurt"]: + if name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]: np_dtype = float_dtyp elif name in ["min", "max"] or self.dtype.itemsize == 8: np_dtype = self.dtype.numpy_dtype.name diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 3e357f99cfb03..4b3431d938f96 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -56,7 +56,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): arr = ser.array df = pd.DataFrame({"a": arr}) - kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} + kwargs = {"ddof": 1} if op_name in ["var", "std", "sem"] else {} cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna) @@ -119,7 +119,7 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) - if op_name in ["count", "kurt", "sem"]: + if op_name == "count": pytest.skip(f"{op_name} not an array method") if not self._supports_reduction(ser, op_name): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 6f18761f77138..070feb1fec4b9 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -72,6 +72,8 @@ def _get_expected_exception( return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["kurt", "sem"]: + return False return True def check_reduce(self, ser: pd.Series, 
op_name: str, skipna: bool): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6d14f04383a65..ea9c5096638d5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -467,17 +467,14 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques self.check_accumulate(ser, op_name, skipna) def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["kurt", "skew"]: + return False + dtype = ser.dtype # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has # no attribute "pyarrow_dtype" pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr] - if pa.types.is_temporal(pa_dtype) and op_name in [ - "sum", - "var", - "skew", - "kurt", - "prod", - ]: + if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod"]: if pa.types.is_duration(pa_dtype) and op_name in ["sum"]: # summing timedeltas is one case that *is* well-defined pass @@ -493,8 +490,6 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: "std", "sem", "var", - "skew", - "kurt", ]: return False @@ -541,23 +536,6 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = getattr(alt, op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): - dtype = data.dtype - pa_dtype = dtype.pyarrow_dtype - - xfail_mark = pytest.mark.xfail( - raises=TypeError, - reason=( - f"{all_numeric_reductions} is not implemented in " - f"pyarrow={pa.__version__} for {pa_dtype}" - ), - ) - if all_numeric_reductions in {"skew", "kurt"} and dtype._is_numeric: - request.applymarker(xfail_mark) - - super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) - @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean( self, data, all_boolean_reductions, skipna, na_value, request @@ -596,11 +574,11 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): else: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": - if op_name not in ["median", "var", "std"]: + if op_name not in ["median", "var", "std", "sem"]: cmp_dtype = arr.dtype else: cmp_dtype = "float64[pyarrow]" - elif op_name in ["median", "var", "std", "mean", "skew"]: + elif op_name in ["median", "var", "std", "mean", "skew", "sem"]: cmp_dtype = "float64[pyarrow]" elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type): cmp_dtype = "uint64[pyarrow]" @@ -619,7 +597,11 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): if data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") request.applymarker(mark) - elif op_name == "std" and pa.types.is_date64(data._pa_array.type) and skipna: + elif ( + op_name in ["std", "sem"] + and pa.types.is_date64(data._pa_array.type) + and skipna + ): # overflow mark = pytest.mark.xfail(reason="Cannot cast") request.applymarker(mark) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 69ce42203d510..3b9079d06e231 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -301,7 +301,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name - elif op_name in ["mean", "median", "var", "std", 
"skew"]: + elif op_name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]: cmp_dtype = "Float64" elif op_name in ["max", "min"]: cmp_dtype = arr.dtype.name @@ -323,9 +323,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): else "UInt64" ) elif arr.dtype.kind == "b": - if op_name in ["mean", "median", "var", "std", "skew"]: - cmp_dtype = "Float64" - elif op_name in ["min", "max"]: + if op_name in ["min", "max"]: cmp_dtype = "boolean" elif op_name in ["sum", "prod"]: cmp_dtype = ( From c437139bb5b77364861f39dd2a04eef2ab6f7b41 Mon Sep 17 00:00:00 2001 From: Abhinav Reddy Date: Mon, 15 Jul 2024 14:06:33 -0400 Subject: [PATCH 179/272] DOC: Add SA01 for pandas.api.types.is_signed_integer_dtype (#59246) * adding See Also section for is_signed_integer_dtype * remove from code_check * Remove pandas and additional line break --------- Co-authored-by: Abhinav Thimma --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c4d91da70adb5..f14cdbc354be0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -310,7 +310,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_period_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ - -i "pandas.api.types.is_signed_integer_dtype SA01" \ -i "pandas.api.types.is_sparse SA01" \ -i "pandas.api.types.is_string_dtype SA01" \ -i "pandas.api.types.is_timedelta64_ns_dtype SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bee8af46baa64..975a4237dd43f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -715,6 +715,15 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: Whether or not the array or dtype is of a signed integer dtype and not an instance of timedelta64. + See Also + -------- + api.types.is_integer_dtype: Check whether the provided array or dtype + is of an integer dtype. + api.types.is_numeric_dtype: Check whether the provided array or dtype + is of a numeric dtype. + api.types.is_unsigned_integer_dtype: Check whether the provided array + or dtype is of an unsigned integer dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_signed_integer_dtype From b608ddb3efbc9530fd26a486fab1c1ceba963299 Mon Sep 17 00:00:00 2001 From: Henry Cuzco <40706933+hfactor13@users.noreply.github.com> Date: Mon, 15 Jul 2024 11:07:23 -0700 Subject: [PATCH 180/272] DOC: Added a missing docstring to pandas/conftest.py. (#59244) Added a missing docstring to pandas/conftest.py It came up in the pytest output. See function ea_scalar_and_dtype. Co-authored-by: Henry Cuzco <40706933+uiucmeche1317@users.noreply.github.com> --- pandas/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index 70e729dfb98a4..5e0dfd7ee644d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -951,6 +951,9 @@ def rand_series_with_duplicate_datetimeindex() -> Series: ] ) def ea_scalar_and_dtype(request): + """ + Fixture that tests each scalar and datetime type. 
+ """ return request.param From b46bae4f0b41c8402a642973219ea83aa067711a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 08:21:24 -1000 Subject: [PATCH 181/272] CLN: More read_csv state (#59210) * CLean up index methods * Remove unused try_parse_dates * Clean usecol and date processing * Clean clear buffer * remove some single use * Typing --- pandas/io/parsers/base_parser.py | 119 ++++++++++++-------------- pandas/io/parsers/c_parser_wrapper.py | 49 +++++------ pandas/io/parsers/python_parser.py | 24 ++---- 3 files changed, 84 insertions(+), 108 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e8faea76897c6..719afe160614f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -274,46 +274,34 @@ def _make_index( self, data, alldata, columns, indexnamerow: list[Scalar] | None = None ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: index: Index | None - if not is_index_col(self.index_col) or not self.index_col: - index = None + if isinstance(self.index_col, list) and len(self.index_col): + to_remove = [] + indexes = [] + for idx in self.index_col: + if isinstance(idx, str): + raise ValueError(f"Index {idx} invalid") + to_remove.append(idx) + indexes.append(alldata[idx]) + # remove index items from content and columns, don't pop in + # loop + for i in sorted(to_remove, reverse=True): + alldata.pop(i) + if not self._implicit_index: + columns.pop(i) + index = self._agg_index(indexes) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index = index.set_names(indexnamerow[:coffset]) else: - simple_index = self._get_simple_index(alldata, columns) - index = self._agg_index(simple_index) - - # add names for the index - if indexnamerow: - coffset = len(indexnamerow) - len(columns) - assert index is not None - index = index.set_names(indexnamerow[:coffset]) + index = None # maybe create a mi on the columns columns = self._maybe_make_multi_index_columns(columns, self.col_names) return index, columns - @final - def _get_simple_index(self, data, columns): - def ix(col): - if not isinstance(col, str): - return col - raise ValueError(f"Index {col} invalid") - - to_remove = [] - index = [] - for idx in self.index_col: - i = ix(idx) - to_remove.append(i) - index.append(data[i]) - - # remove index items from content and columns, don't pop in - # loop - for i in sorted(to_remove, reverse=True): - data.pop(i) - if not self._implicit_index: - columns.pop(i) - - return index - @final def _clean_mapping(self, mapping): """converts col numbers to names""" @@ -333,12 +321,13 @@ def _clean_mapping(self, mapping): return clean @final - def _agg_index(self, index, try_parse_dates: bool = True) -> Index: + def _agg_index(self, index) -> Index: arrays = [] converters = self._clean_mapping(self.converters) + clean_dtypes = self._clean_mapping(self.dtype) for i, arr in enumerate(index): - if try_parse_dates and self._should_parse_dates(i): + if self._should_parse_dates(i): arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, @@ -364,8 +353,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: else: col_na_values, col_na_fvalues = set(), set() - clean_dtypes = self._clean_mapping(self.dtype) - cast_type = None index_converter = False if self.index_names is not None: @@ -632,35 +619,6 @@ def _check_data_length( stacklevel=find_stack_level(), ) - @overload - def 
_evaluate_usecols( - self, - usecols: Callable[[Hashable], object], - names: Iterable[Hashable], - ) -> set[int]: ... - - @overload - def _evaluate_usecols( - self, usecols: SequenceT, names: Iterable[Hashable] - ) -> SequenceT: ... - - @final - def _evaluate_usecols( - self, - usecols: Callable[[Hashable], object] | SequenceT, - names: Iterable[Hashable], - ) -> SequenceT | set[int]: - """ - Check whether or not the 'usecols' parameter - is a callable. If so, enumerates the 'names' - parameter and returns a set of indices for - each entry in 'names' that evaluates to True. - If not a callable, returns 'usecols'. - """ - if callable(usecols): - return {i for i, name in enumerate(names) if usecols(name)} - return usecols - @final def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT: """ @@ -988,3 +946,32 @@ def _validate_usecols_arg(usecols): return usecols, usecols_dtype return usecols, None + + +@overload +def evaluate_callable_usecols( + usecols: Callable[[Hashable], object], + names: Iterable[Hashable], +) -> set[int]: ... + + +@overload +def evaluate_callable_usecols( + usecols: SequenceT, names: Iterable[Hashable] +) -> SequenceT: ... + + +def evaluate_callable_usecols( + usecols: Callable[[Hashable], object] | SequenceT, + names: Iterable[Hashable], +) -> SequenceT | set[int]: + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. + """ + if callable(usecols): + return {i for i, name in enumerate(names) if usecols(name)} + return usecols diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index b59a778624c49..f4198ac2a1443 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -31,6 +31,7 @@ ParserBase, ParserError, date_converter, + evaluate_callable_usecols, is_index_col, validate_parse_dates_presence, ) @@ -133,7 +134,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: self.orig_names = self.names[:] # type: ignore[has-type] if self.usecols: - usecols = self._evaluate_usecols(self.usecols, self.orig_names) + usecols = evaluate_callable_usecols(self.usecols, self.orig_names) # GH 14671 # assert for mypy, orig_names is List or None, None would error in issubset @@ -256,8 +257,7 @@ def read( columns, self.col_names ) - if self.usecols is not None: - columns = self._filter_usecols(columns) + columns = _filter_usecols(self.usecols, columns) col_dict = {k: v for k, v in col_dict.items() if k in columns} @@ -290,13 +290,21 @@ def read( else: values = data.pop(self.index_col[i]) - values = self._maybe_parse_dates(values, i, try_parse_dates=True) + if self._should_parse_dates(i): + values = date_converter( + values, + col=self.index_names[i] + if self.index_names is not None + else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, + ) arrays.append(values) index = ensure_index_from_sequences(arrays) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) @@ -320,8 +328,7 @@ def read( names = list(self.orig_names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) # columns 
as list alldata = [x[1] for x in data_tups] @@ -335,25 +342,13 @@ def read( return index, column_names, date_data - def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: - # hackish - usecols = self._evaluate_usecols(self.usecols, names) - if usecols is not None and len(names) != len(usecols): - return [ - name for i, name in enumerate(names) if i in usecols or name in usecols - ] - return names - - def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): - if try_parse_dates and self._should_parse_dates(index): - values = date_converter( - values, - col=self.index_names[index] if self.index_names is not None else None, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - date_format=self.date_format, - ) - return values + +def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]: + # hackish + usecols = evaluate_callable_usecols(usecols, names) + if usecols is not None and len(names) != len(usecols): + return [name for i, name in enumerate(names) if i in usecols or name in usecols] + return names def _concatenate_chunks( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 05fe963e9b2b7..c445529a6db48 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -59,6 +59,7 @@ ) from pandas.io.parsers.base_parser import ( ParserBase, + evaluate_callable_usecols, get_na_values, parser_defaults, validate_parse_dates_presence, @@ -127,9 +128,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.quoting = kwds["quoting"] self.skip_blank_lines = kwds["skip_blank_lines"] - self.has_index_names = False - if "has_index_names" in kwds: - self.has_index_names = kwds["has_index_names"] + # Passed from read_excel + self.has_index_names = kwds.get("has_index_names", False) self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] @@ -299,9 +299,10 @@ def read( return index, conv_columns, col_dict # handle new style for names in index - count_empty_content_vals = count_empty_vals(content[0]) indexnamerow = None - if self.has_index_names and count_empty_content_vals == len(columns): + if self.has_index_names and sum( + int(v == "" or v is None) for v in content[0] + ) == len(columns): indexnamerow = content[0] content = content[1:] @@ -605,7 +606,7 @@ def _infer_columns( # serve as the 'line' for parsing if have_mi_columns and hr > 0: if clear_buffer: - self._clear_buffer() + self.buf.clear() columns.append([None] * len(columns[-1])) return columns, num_original_columns, unnamed_cols @@ -687,7 +688,7 @@ def _infer_columns( num_original_columns = len(this_columns) if clear_buffer: - self._clear_buffer() + self.buf.clear() first_line: list[Scalar] | None if names is not None: @@ -774,7 +775,7 @@ def _handle_usecols( col_indices: set[int] | list[int] if self.usecols is not None: if callable(self.usecols): - col_indices = self._evaluate_usecols(self.usecols, usecols_key) + col_indices = evaluate_callable_usecols(self.usecols, usecols_key) elif any(isinstance(u, str) for u in self.usecols): if len(columns) > 1: raise ValueError( @@ -1094,9 +1095,6 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: lines=lines, search=self.decimal, replace="." 
) - def _clear_buffer(self) -> None: - self.buf = [] - def _get_index_name( self, ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]: @@ -1526,10 +1524,6 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]: ] -def count_empty_vals(vals) -> int: - return sum(1 for v in vals if v == "" or v is None) - - def _validate_skipfooter_arg(skipfooter: int) -> int: """ Validate the 'skipfooter' parameter. From d6724bc5aa7a13a164270e7b62010a4990ee1ca3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 15 Jul 2024 08:21:59 -1000 Subject: [PATCH 182/272] BUG: .plot(kind='pie') with ArrowDtype (#59211) --- doc/source/whatsnew/v3.0.0.rst | 4 ++-- pandas/plotting/_matplotlib/core.py | 11 +++++++---- pandas/tests/plotting/test_series.py | 6 ++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ef06f57f611d1..f0cfc592bc03b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -582,9 +582,9 @@ Period Plotting ^^^^^^^^ -- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`) +- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`) - Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`) -- +- Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 8b108346160d6..fb7d785a94bc4 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -55,7 +55,6 @@ from pandas.core.dtypes.missing import isna import pandas.core.common as com -from pandas.core.frame import DataFrame from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -94,6 +93,7 @@ ) from pandas import ( + DataFrame, Index, Series, ) @@ -183,7 +183,7 @@ def __init__( # Assign the rest of columns into self.columns if by is explicitly defined # while column is not, only need `columns` in hist/box plot when it's DF # TODO: Might deprecate `column` argument in future PR (#28373) - if isinstance(data, DataFrame): + if isinstance(data, ABCDataFrame): if column: self.columns = com.maybe_make_list(column) elif self.by is None: @@ -2035,9 +2035,12 @@ def _kind(self) -> Literal["pie"]: _layout_type = "horizontal" - def __init__(self, data, kind=None, **kwargs) -> None: + def __init__(self, data: Series | DataFrame, kind=None, **kwargs) -> None: data = data.fillna(value=0) - if (data < 0).any().any(): + lt_zero = data < 0 + if isinstance(data, ABCDataFrame) and lt_zero.any().any(): + raise ValueError(f"{self._kind} plot doesn't allow negative values") + elif isinstance(data, ABCSeries) and lt_zero.any(): raise ValueError(f"{self._kind} plot doesn't allow negative values") MPLPlot.__init__(self, data, kind=kind, **kwargs) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 279d9a18d8df7..2ca9dbf92e617 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -377,6 +377,12 @@ def test_pie_series(self): _check_text_labels(ax.texts, series.index) assert ax.get_ylabel() == "" + def test_pie_arrow_type(self): + # GH 59192 + pytest.importorskip("pyarrow") + ser = Series([1, 2, 3, 
4], dtype="int32[pyarrow]") + _check_plot_works(ser.plot.pie) + def test_pie_series_no_label(self): series = Series( np.random.default_rng(2).integers(1, 5), From ed09b58a1aff4814f437082e1dd0847f9a54e16f Mon Sep 17 00:00:00 2001 From: matiaslindgren Date: Mon, 15 Jul 2024 22:25:00 +0200 Subject: [PATCH 183/272] BUG: Fix 58807 (#59243) * throw when frame apply is given invalid axis+func * test that frame_apply throws on invalid func+axis * add a note on unsupported func+axis combination for frame_apply * add bug fix to release notes * fix based on review comments * test also with named axis * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * fix rst --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/apply.py | 9 ++++++--- pandas/core/shared_docs.py | 2 ++ pandas/tests/apply/test_frame_apply.py | 8 ++++++++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f0cfc592bc03b..fafad73bf3915 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -32,6 +32,7 @@ Other enhancements - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) +- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 607a65598783f..d024afa570a1e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -90,16 +90,19 @@ def frame_apply( kwargs=None, ) -> FrameApply: """construct and return a row or column based frame apply object""" + _, func, columns, _ = reconstruct_func(func, **kwargs) + axis = obj._get_axis_number(axis) klass: type[FrameApply] if axis == 0: klass = FrameRowApply elif axis == 1: + if columns: + raise NotImplementedError( + f"Named aggregation is not supported when {axis=}." + ) klass = FrameColumnApply - _, func, _, _ = reconstruct_func(func, **kwargs) - assert func is not None - return klass( obj, func, diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 38a443b56ee3d..5725b96f66cd4 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -49,6 +49,8 @@ for more details. A passed user-defined-function will be passed a Series for evaluation. + +If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. 
 {examples}"""
 
 _shared_docs["compare"] = """
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 939997f44c1a9..78c52d3ddfbdf 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -1330,6 +1330,14 @@ def test_agg_reduce(axis, float_frame):
     tm.assert_frame_equal(result, expected)
 
 
+def test_named_agg_reduce_axis1_raises(float_frame):
+    name1, name2 = float_frame.axes[0].unique()[:2].sort_values()
+    msg = "Named aggregation is not supported when axis=1."
+    for axis in [1, "columns"]:
+        with pytest.raises(NotImplementedError, match=msg):
+            float_frame.agg(row1=(name1, "sum"), row2=(name2, "max"), axis=axis)
+
+
 def test_nuiscance_columns():
     # GH 15015
     df = DataFrame(

From d207d52045bd0ec262d9f5457e0fb5b5e8a21ca1 Mon Sep 17 00:00:00 2001
From: Rajvi Gemawat <55595770+rgemawat2000@users.noreply.github.com>
Date: Mon, 15 Jul 2024 17:50:43 -0400
Subject: [PATCH 184/272] DOC: Add SA01 for pandas.api.types.is_unsigned_integer_dtype (#59250)

adding see also
---
 ci/code_checks.sh            | 1 -
 pandas/core/dtypes/common.py | 9 +++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index f14cdbc354be0..a96af7dff7392 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -313,7 +313,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
     -i "pandas.api.types.is_sparse SA01" \
     -i "pandas.api.types.is_string_dtype SA01" \
     -i "pandas.api.types.is_timedelta64_ns_dtype SA01" \
-    -i "pandas.api.types.is_unsigned_integer_dtype SA01" \
    -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
     -i "pandas.api.types.union_categoricals RT03,SA01" \
     -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 975a4237dd43f..7db3f8ecebf2a 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -780,6 +780,15 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool:
     boolean
         Whether or not the array or dtype is of an unsigned integer dtype.
 
+    See Also
+    --------
+    api.types.is_signed_integer_dtype : Check whether the provided array
+        or dtype is of a signed integer dtype.
+    api.types.is_integer_dtype : Check whether the provided array or dtype
+        is of an integer dtype.
+    api.types.is_numeric_dtype : Check whether the provided array or dtype
+        is of a numeric dtype.
+ Examples -------- >>> from pandas.api.types import is_unsigned_integer_dtype From 40c63d871d890858166fa6daa782e552635e22b8 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Mon, 15 Jul 2024 17:28:35 -0500 Subject: [PATCH 185/272] Remove extra space in ``resample`` freq deprecation (#59251) Remove extra space in resample freq deprecation --- pandas/_libs/tslibs/offsets.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 0afeb002a8151..991f155847ac6 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4894,7 +4894,7 @@ cpdef to_offset(freq, bint is_period=False): f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " - f" instead.", + f"instead.", FutureWarning, stacklevel=find_stack_level(), ) From 794913c8ff02e44ec82e302c18dbbadfc0825650 Mon Sep 17 00:00:00 2001 From: aram-cinnamon <97805700+aram-cinnamon@users.noreply.github.com> Date: Tue, 16 Jul 2024 05:10:25 -0400 Subject: [PATCH 186/272] BUG: `pandas.tseries.frequencies.to_offset()` raises `ValueError` when parsing a `LastWeekOfMonth` frequency string (#59245) * add LastWeekOfMonth to prefix_mapping * update whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 1 + pandas/tests/tslibs/test_to_offset.py | 1 + 3 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index fafad73bf3915..cc7706741e653 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -503,6 +503,7 @@ Datetimelike - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) +- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 991f155847ac6..db35cc0c93237 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4676,6 +4676,7 @@ prefix_mapping = { Hour, # 'h' Day, # 'D' WeekOfMonth, # 'WOM' + LastWeekOfMonth, # 'LWOM' FY5253, FY5253Quarter, ] diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 9e32a33650591..67521c7e2a3ac 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -31,6 +31,7 @@ ("2SME-16", offsets.SemiMonthEnd(2, day_of_month=16)), ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), ("2SMS-15", offsets.SemiMonthBegin(2)), + ("LWOM-MON", offsets.LastWeekOfMonth()), ], ) def test_to_offset(freq_input, expected): From a8875e17e09bf50a0c2726e3f8eb129474ac4604 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 16 Jul 2024 22:09:28 +0530 
Subject: [PATCH 187/272] DOC: add PR02 for pandas.DataFrame.plot (#59255) * DOC: add PR02 for pandas.DataFrame.plot * DOC: fix PR02 for pandas.Series.plot --- ci/code_checks.sh | 2 -- pandas/plotting/_core.py | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a96af7dff7392..255e680d8dacb 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.DataFrame.plot PR02" \ -i "pandas.Grouper PR02" \ -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ @@ -156,7 +155,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.lt SA01" \ -i "pandas.Series.ne SA01" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.plot PR02" \ -i "pandas.Series.pop SA01" \ -i "pandas.Series.prod RT03" \ -i "pandas.Series.product RT03" \ diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 61c44e58b643a..17df98f026656 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -652,6 +652,9 @@ class PlotAccessor(PandasObject): ---------- data : Series or DataFrame The object for which the method is called. + + Attributes + ---------- x : label or position, default None Only used if data is a DataFrame. y : label, position or list of label, positions, default None From 56c80f89cafadcd7050eff2c9bf34bf890ee93fc Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 16 Jul 2024 18:42:04 +0200 Subject: [PATCH 188/272] ENH: Globally enable Cython free-threading directive (#59248) * ENH: Globally enable Cython free-threading directive This is the Cython equivalent of adding a `Py_mod_gil` slot with `Py_MOD_GIL_NOT_USED` like we did in #59135. 
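
[Editor's note] As a point of reference only, not part of the patch: on a
free-threaded (PEP 703) CPython build one can check whether the imported
extensions kept the GIL disabled. This is a minimal sketch; it assumes a
CPython 3.13+ free-threaded build, where `sys._is_gil_enabled()` is
expected to be available.

    # Minimal sketch (assumes a CPython 3.13+ free-threaded build, where
    # sys._is_gil_enabled() is expected to exist).
    # If every imported C/Cython extension declared itself GIL-free
    # (Py_MOD_GIL_NOT_USED, or the Cython directive this patch adds), the
    # interpreter does not re-enable the GIL on import.
    import sys

    import pandas  # noqa: F401  # importing pulls in the extensions under test

    print(sys._is_gil_enabled())  # expected: False on a compatible build
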
* Use add_project_arguments * Mark json with Py_MOD_GIL_NOT_USED & remove PYTHON_GIL env var from ci test job --- .github/workflows/unit-tests.yml | 2 -- meson.build | 5 +++++ pandas/_libs/src/vendored/ujson/python/ujson.c | 4 ++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index ddb6ecbe83126..a9585c17454fb 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -388,8 +388,6 @@ jobs: - name: Run Tests uses: ./.github/actions/run-tests - env: - PYTHON_GIL: 0 # NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml emscripten: diff --git a/meson.build b/meson.build index 06623a305ab54..efe543b7a267c 100644 --- a/meson.build +++ b/meson.build @@ -44,6 +44,11 @@ else meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif +cy = meson.get_compiler('cython') +if cy.version().version_compare('>=3.1.0') + add_project_arguments('-Xfreethreading_compatible=true', language : 'cython') +endif + # Needed by pandas.test() when it looks for the pytest ini options py.install_sources( 'pyproject.toml', diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c index 075411a23b075..f369d122a3dbe 100644 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ b/pandas/_libs/src/vendored/ujson/python/ujson.c @@ -384,6 +384,10 @@ PyMODINIT_FUNC PyInit_json(void) { return NULL; } +#ifdef Py_GIL_DISABLED + PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED); +#endif + #ifndef PYPY_VERSION PyObject *mod_decimal = PyImport_ImportModule("decimal"); if (mod_decimal) { From b4bd4ae270a2e42ad95498c9ce9f4b8abdad3bdd Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Tue, 16 Jul 2024 19:45:38 +0300 Subject: [PATCH 189/272] CI/DOC: Fix CI failure for Status: Draft PDEPs and don't show them on the roadmap webpage (#59254) * CI: fix CI failure for PDEP with status: Draft * Exclude draft PDEPs --- web/pandas_web.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/web/pandas_web.py b/web/pandas_web.py index aac07433f2712..b3872b829c73a 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -280,6 +280,7 @@ def roadmap_pdeps(context): PDEP's in different status from the directory tree and GitHub. """ KNOWN_STATUS = { + "Draft", "Under discussion", "Accepted", "Implemented", @@ -319,7 +320,7 @@ def roadmap_pdeps(context): github_repo_url = context["main"]["github_repo_url"] resp = requests.get( "https://api.github.com/search/issues?" 
- f"q=is:pr is:open label:PDEP repo:{github_repo_url}", + f"q=is:pr is:open label:PDEP draft:false repo:{github_repo_url}", headers=GITHUB_API_HEADERS, timeout=5, ) From a2710a87adb87c997d1003a2bf057cece4572b11 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 17 Jul 2024 22:34:09 +0530 Subject: [PATCH 190/272] DOC: fix PR02 for pandas.Grouper (#59259) --- ci/code_checks.sh | 1 - pandas/core/groupby/grouper.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 255e680d8dacb..364f86e7edd5d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.Grouper PR02" \ -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 5f680de77649f..5f9ebdcea4a2d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -72,6 +72,9 @@ class Grouper: Currently unused, reserved for future use. **kwargs Dictionary of the keyword arguments to pass to Grouper. + + Attributes + ---------- key : str, defaults to None Groupby key, which selects the grouping column of the target. level : name/number, defaults to None From dec86b3090611af08c328d04a6189d8933d45cf0 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 17 Jul 2024 22:34:41 +0530 Subject: [PATCH 191/272] DOC: fix PR07,RT03,SA01 for pandas.MultiIndex.drop (#59264) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 364f86e7edd5d..59cbc075b5bc8 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -72,7 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ - -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 19c94fa4104d7..ee24e485a9331 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2316,16 +2316,32 @@ def drop( # type: ignore[override] """ Make a new :class:`pandas.MultiIndex` with the passed list of codes deleted. + This method allows for the removal of specified labels from a MultiIndex. + The labels to be removed can be provided as a list of tuples if no level + is specified, or as a list of labels from a specific level if the level + parameter is provided. This can be useful for refining the structure of a + MultiIndex to fit specific requirements. + Parameters ---------- codes : array-like Must be a list of tuples when ``level`` is not specified. level : int or level name, default None + Level from which the labels will be dropped. errors : str, default 'raise' + If 'ignore', suppress error and existing labels are dropped. Returns ------- MultiIndex + A new MultiIndex with the specified labels removed. 
+ + See Also + -------- + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + MultiIndex.reorder_levels : Rearrange levels using input order. + MultiIndex.rename : Rename levels in a MultiIndex. Examples -------- From 288af5f6cff8f864a587985c2b0f644ea51b0663 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Wed, 17 Jul 2024 20:09:04 +0300 Subject: [PATCH 192/272] BUG: Fix to_datetime not respecting dayfirst (#58876) * ENH: Warn when to_datetime falls back to dateutil when dayfirst is passed * Assert warnings * Remove warnings and fix functionality * Add whatsnew, write test --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/tslibs/conversion.pyx | 65 ++++++++++++++------------ pandas/_libs/tslibs/parsing.pyx | 49 +++++++++---------- pandas/tests/tools/test_to_datetime.py | 2 + 4 files changed, 63 insertions(+), 54 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cc7706741e653..ba6636cb42b6c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -509,6 +509,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) - Bug in :meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) +- Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 3a55f5fa0c003..0fadbbbed2c72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -606,37 +606,42 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, # equiv: datetime.today().replace(tzinfo=tz) return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us) else: - string_to_dts_failed = string_to_dts( - ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed: - reso = get_supported_reso(out_bestunit) - check_dts_bounds(&dts, reso) - obj = _TSObject() - obj.dts = dts - obj.creso = reso - ival = npy_datetimestruct_to_datetime(reso, &dts) - - if out_local == 1: - obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) - obj.value = tz_localize_to_utc_single( - ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso - ) - if tz is None: - check_overflows(obj, reso) - return obj - _adjust_tsobject_tz_using_offset(obj, tz) - return obj - else: - if tz is not None: - # shift for _localize_tso - ival = tz_localize_to_utc_single( - ival, tz, ambiguous="raise", nonexistent=None, creso=reso + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + ts, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + reso = get_supported_reso(out_bestunit) + check_dts_bounds(&dts, reso) + obj = _TSObject() + obj.dts = dts + obj.creso = reso + ival = npy_datetimestruct_to_datetime(reso, &dts) + + if out_local == 1: + obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) + obj.value = tz_localize_to_utc_single( + ival, + obj.tzinfo, + ambiguous="raise", + 
nonexistent=None, + creso=reso, ) - obj.value = ival - maybe_localize_tso(obj, tz, obj.creso) - return obj + if tz is None: + check_overflows(obj, reso) + return obj + _adjust_tsobject_tz_using_offset(obj, tz) + return obj + else: + if tz is not None: + # shift for _localize_tso + ival = tz_localize_to_utc_single( + ival, tz, ambiguous="raise", nonexistent=None, creso=reso + ) + obj.value = ival + maybe_localize_tso(obj, tz, obj.creso) + return obj dt = parse_datetime_string( ts, diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 35d2433a707a0..308183402198d 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -377,32 +377,33 @@ def parse_datetime_string_with_reso( raise ValueError(f'Given date string "{date_string}" not likely a datetime') # Try iso8601 first, as it handles nanoseconds - string_to_dts_failed = string_to_dts( - date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed: - # Match Timestamp and drop picoseconds, femtoseconds, attoseconds - # The new resolution will just be nano - # GH#50417 - if out_bestunit in _timestamp_units: - out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns - - if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: - # TODO: avoid circular import - from pandas import Timestamp - parsed = Timestamp(date_string) - else: - if out_local: - tz = timezone(timedelta(minutes=out_tzoffset)) + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + date_string, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + # Match Timestamp and drop picoseconds, femtoseconds, attoseconds + # The new resolution will just be nano + # GH#50417 + if out_bestunit in _timestamp_units: + out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns + + if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: + # TODO: avoid circular import + from pandas import Timestamp + parsed = Timestamp(date_string) else: - tz = None - parsed = datetime_new( - dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz - ) + if out_local: + tz = timezone(timedelta(minutes=out_tzoffset)) + else: + tz = None + parsed = datetime_new( + dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz + ) - reso = npy_unit_to_attrname[out_bestunit] - return parsed, reso + reso = npy_unit_to_attrname[out_bestunit] + return parsed, reso parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit) if parsed is not None: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c1d6baaf17c92..3a47d87286711 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2988,6 +2988,8 @@ def test_parsers_nat(self): ("20/12/21", True, False, datetime(2021, 12, 20)), ("20/12/21", False, True, datetime(2020, 12, 21)), ("20/12/21", True, True, datetime(2020, 12, 21)), + # GH 58859 + ("20201012", True, False, datetime(2020, 12, 10)), ], ) def test_parsers_dayfirst_yearfirst( From d8cfd5245020ea8249ee006e683fdef7d5f30f68 Mon Sep 17 00:00:00 2001 From: mattbest Date: Wed, 17 Jul 2024 14:16:00 -0500 Subject: [PATCH 193/272] DOC: Fix numpy docstring validation errors in pandas.Series.skew (#59266) adding a see also section to pandas.Series.skew --- ci/code_checks.sh | 1 - pandas/core/generic.py | 10 +++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 59cbc075b5bc8..f37547e9cea9e 100755 --- a/ci/code_checks.sh +++ 
b/ci/code_checks.sh @@ -158,7 +158,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.product RT03" \ -i "pandas.Series.reorder_levels RT03,SA01" \ -i "pandas.Series.sem PR01,RT03,SA01" \ - -i "pandas.Series.skew SA01" \ -i "pandas.Series.sparse PR01,SA01" \ -i "pandas.Series.sparse.density SA01" \ -i "pandas.Series.sparse.fill_value SA01" \ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fc9821a65777d..5913532e28ec2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12481,6 +12481,14 @@ def last_valid_index(self) -> Hashable: stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0 ) +_skew_see_also = """ + +See Also +-------- +Series.skew : Return unbiased skew over requested axis. +Series.var : Return unbiased variance over requested axis. +Series.std : Return unbiased standard deviation over requested axis.""" + _stat_func_see_also = """ See Also @@ -12740,7 +12748,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "skew": base_doc = _num_doc desc = "Return unbiased skew over requested axis.\n\nNormalized by N-1." - see_also = "" + see_also = _skew_see_also examples = """ Examples From fc8fc82233e7910f5748acbc24ea6df77f4a233a Mon Sep 17 00:00:00 2001 From: Aditya060 <51037240+Aditya060@users.noreply.github.com> Date: Thu, 18 Jul 2024 00:47:01 +0530 Subject: [PATCH 194/272] =?UTF-8?q?Removed=20SA01=20error=20for=20pandas.S?= =?UTF-8?q?eries.to=5Fframe.=20Added=20See=20Also=20section=E2=80=A6=20(#5?= =?UTF-8?q?9262)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Removed SA01 error for pandas.Series.to_frame. Added See Also section to pandas.Series.to_frame. Modified ci/codecheks.sh and pandas/core/series.py. * Update pandas/core/series.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/core/series.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/code_checks.sh | 1 - pandas/core/series.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f37547e9cea9e..3876e493ce91a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -196,7 +196,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.zfill RT03" \ -i "pandas.Series.struct.dtypes SA01" \ -i "pandas.Series.to_dict SA01" \ - -i "pandas.Series.to_frame SA01" \ -i "pandas.Series.to_markdown SA01" \ -i "pandas.Series.update PR07,SA01" \ -i "pandas.Timedelta.asm8 SA01" \ diff --git a/pandas/core/series.py b/pandas/core/series.py index 184c774d04a47..5b73c94442f1c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1749,6 +1749,10 @@ def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: DataFrame DataFrame representation of Series. + See Also + -------- + Series.to_dict : Convert Series to dict object. 
+ Examples -------- >>> s = pd.Series(["a", "b", "c"], name="vals") From a94dd996241d9b4f48e9379e0068e4bec9ca1287 Mon Sep 17 00:00:00 2001 From: ktseng4096 <32848825+ktseng4096@users.noreply.github.com> Date: Thu, 18 Jul 2024 08:52:25 -0700 Subject: [PATCH 195/272] DOC: add See Also section to series.to_dict (#59269) add docstring to series.to_dict --- ci/code_checks.sh | 1 - pandas/core/series.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3876e493ce91a..1e9250fd77fe5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -195,7 +195,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.wrap RT03,SA01" \ -i "pandas.Series.str.zfill RT03" \ -i "pandas.Series.struct.dtypes SA01" \ - -i "pandas.Series.to_dict SA01" \ -i "pandas.Series.to_markdown SA01" \ -i "pandas.Series.update PR07,SA01" \ -i "pandas.Timedelta.asm8 SA01" \ diff --git a/pandas/core/series.py b/pandas/core/series.py index 5b73c94442f1c..9209a80ada0d1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1712,6 +1712,12 @@ def to_dict( collections.abc.MutableMapping Key-value representation of Series. + See Also + -------- + Series.to_list: Converts Series to a list of the values. + Series.to_numpy: Converts Series to NumPy ndarray. + Series.array: ExtensionArray of the data backing this Series. + Examples -------- >>> s = pd.Series([1, 2, 3, 4]) From 941d0790952d3f51a292f1c6d100b72f3286aa05 Mon Sep 17 00:00:00 2001 From: Kirill Date: Thu, 18 Jul 2024 11:54:01 -0400 Subject: [PATCH 196/272] BUG: pandas-dev#58594 (#59258) * BUG: pandas-dev#58594 * updating whatsnew doc * updates based on feedback from @mroeschke * switching to default_index() --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/base.py | 5 ++++- pandas/tests/frame/constructors/test_from_records.py | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ba6636cb42b6c..c5c886912eae0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -547,6 +547,7 @@ Interval Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) +- Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) - Missing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7d43498d4267b..5bffac5fa64b6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7473,9 +7473,12 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: -------- ensure_index """ + from pandas.core.indexes.api import default_index from pandas.core.indexes.multi import MultiIndex - if len(sequences) == 1: + if len(sequences) == 0: + return default_index(0) + elif len(sequences) == 1: if names is not None: names = names[0] return Index(maybe_sequence_to_range(sequences[0]), name=names) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 5be42d41af03a..ed2f0aa9c4679 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -148,6 +148,12 @@ def test_from_records_sequencelike_empty(self): assert len(result) == 0 assert len(result.columns) == 0 + def test_from_records_sequencelike_empty_index(self): + result = DataFrame.from_records([], index=[]) 
+ assert len(result) == 0 + assert len(result.columns) == 0 + assert len(result.index) == 0 + def test_from_records_dictlike(self): # test the dict methods df = DataFrame( From 71cacde87b8bfe377202b8799031ef9a76bae01e Mon Sep 17 00:00:00 2001 From: Kushagr Arora Date: Thu, 18 Jul 2024 11:55:04 -0400 Subject: [PATCH 197/272] DOC: Add fixtures for testing DropDuplicates for datetimelike dataframes (#59268) Adding fixtures for DropDuplicates for datetimelike dataframes --- .../indexes/datetimelike_/test_drop_duplicates.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py index 61a79c4ceabf9..c2d76c0bcc8bd 100644 --- a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py +++ b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py @@ -70,20 +70,32 @@ def test_drop_duplicates(self, keep, expected, index, idx): class TestDropDuplicatesPeriodIndex(DropDuplicates): @pytest.fixture(params=["D", "3D", "h", "2h", "min", "2min", "s", "3s"]) def freq(self, request): + """ + Fixture to test for different frequencies for PeriodIndex. + """ return request.param @pytest.fixture def idx(self, freq): + """ + Fixture to get PeriodIndex for 10 periods for different frequencies. + """ return period_range("2011-01-01", periods=10, freq=freq, name="idx") class TestDropDuplicatesDatetimeIndex(DropDuplicates): @pytest.fixture def idx(self, freq_sample): + """ + Fixture to get DatetimeIndex for 10 periods for different frequencies. + """ return date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") class TestDropDuplicatesTimedeltaIndex(DropDuplicates): @pytest.fixture def idx(self, freq_sample): + """ + Fixture to get TimedeltaIndex for 10 periods for different frequencies. + """ return timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") From 7a4a7bf0908d6af80a99ddc2fbcebbdfb16d161e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Jul 2024 05:57:59 -1000 Subject: [PATCH 198/272] DEPS: Bump misc testing dependencies (#59257) * DEPS: Bump misc testing dependencies * pyqt alignment --- .circleci/config.yml | 2 +- .github/workflows/unit-tests.yml | 8 ++++---- .github/workflows/wheels.yml | 2 +- ci/deps/actions-310-minimum_versions.yaml | 8 ++++---- ci/deps/actions-310.yaml | 8 ++++---- ci/deps/actions-311-downstream_compat.yaml | 8 ++++---- ci/deps/actions-311-numpydev.yaml | 4 ++-- ci/deps/actions-311-pyarrownightly.yaml | 4 ++-- ci/deps/actions-311.yaml | 8 ++++---- ci/deps/actions-312.yaml | 8 ++++---- ci/deps/actions-pypy-39.yaml | 4 ++-- ci/deps/circle-311-arm64.yaml | 8 ++++---- ci/meta.yaml | 4 ++-- environment.yml | 6 +++--- pandas/compat/_optional.py | 2 +- pandas/conftest.py | 8 ++++---- pyproject.toml | 10 +++++----- requirements-dev.txt | 6 +++--- scripts/tests/data/deps_expected_random.yaml | 2 +- scripts/tests/data/deps_minimum.toml | 10 +++++----- scripts/tests/data/deps_unmodified_random.yaml | 2 +- 21 files changed, 61 insertions(+), 61 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4acc6473e6add..745b04a5159f7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -54,7 +54,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir export PANDAS_CI=1 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a9585c17454fb..261859a14459a 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -227,7 +227,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -265,7 +265,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir @@ -339,7 +339,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" python -m pip list @@ -382,7 +382,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov + python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . 
--no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" python -m pip list diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 02100648b636a..6405156f09833 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -178,7 +178,7 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; + python -m pip install hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index a9c205d24d212..0c46f476893dd 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -15,9 +15,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -32,7 +32,7 @@ dependencies: - fastparquet=2023.10.0 - fsspec=2022.11.0 - html5lib=1.1 - - hypothesis=6.46.1 + - hypothesis=6.84.0 - gcsfs=2022.11.0 - jinja2=3.1.2 - lxml=4.9.2 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index ed7dfe1a3c17e..0af46752f5b3d 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -13,8 +13,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -29,7 +30,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 @@ -60,4 +61,3 @@ dependencies: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 - - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index dd1d341c70a9b..1a842c7212c1f 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -14,9 +14,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -31,7 +31,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 61a0eabbf133c..748cfa861ec32 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -13,8 +13,8 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # pandas dependencies - python-dateutil diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 5455b9b84b034..469fb1bfb9138 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -13,8 +13,8 @@ dependencies: # test dependencies - pytest>=7.3.2 - 
pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # required dependencies - python-dateutil diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 388116439f944..75394e2c8e109 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -13,8 +13,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -29,7 +30,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 @@ -59,4 +60,3 @@ dependencies: - pip: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 1d9f8aa3b092a..d4b43ddef3601 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -13,8 +13,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -29,7 +30,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 @@ -60,4 +61,3 @@ dependencies: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 - - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index d9c8dd81b7c33..b0ae9f1e48473 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -16,8 +16,8 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # required - numpy diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index 1c31d353699f8..18535d81e6985 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -13,9 +13,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -30,7 +30,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 diff --git a/ci/meta.yaml b/ci/meta.yaml index aac5593e493b7..b76bef2f630b7 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -64,9 +64,9 @@ test: requires: - pip - pytest >=7.3.2 - - pytest-xdist >=2.2.0 + - pytest-xdist >=3.4.0 - pytest-cov - - hypothesis >=6.46.1 + - hypothesis >=6.84.0 - tomli # [py<311] about: diff --git a/environment.yml b/environment.yml index dcc7aa5280b2c..e5646af07c45c 100644 --- a/environment.yml +++ b/environment.yml @@ -15,8 +15,8 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-qt>=4.4.0 - pytest-localserver - pyqt>=5.15.9 - coverage @@ -33,7 +33,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - ipython - jinja2>=3.1.2 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 
b62a4c8dcc8c8..06082e71af32a 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -28,7 +28,7 @@ "fastparquet": "2023.10.0", "fsspec": "2022.11.0", "html5lib": "1.1", - "hypothesis": "6.46.1", + "hypothesis": "6.84.0", "gcsfs": "2022.11.0", "jinja2": "3.1.2", "lxml.etree": "4.9.2", diff --git a/pandas/conftest.py b/pandas/conftest.py index 5e0dfd7ee644d..c36789d2950bc 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -76,7 +76,6 @@ Index, MultiIndex, ) -from pandas.util.version import Version if TYPE_CHECKING: from collections.abc import ( @@ -182,9 +181,10 @@ def pytest_collection_modifyitems(items, config) -> None: ignore_doctest_warning(item, path, message) -hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] -if Version(hypothesis.__version__) >= Version("6.83.2"): - hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors) +hypothesis_health_checks = [ + hypothesis.HealthCheck.too_slow, + hypothesis.HealthCheck.differing_executors, +] # Hypothesis hypothesis.settings.register_profile( diff --git a/pyproject.toml b/pyproject.toml index 47fd540d67ab2..7d3b55ce63ee4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] +test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] pyarrow = ['pyarrow>=10.0.1'] performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] @@ -91,7 +91,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'fsspec>=2022.11.0', 'gcsfs>=2022.11.0', 'html5lib>=1.1', - 'hypothesis>=6.46.1', + 'hypothesis>=6.84.0', 'jinja2>=3.1.2', 'lxml>=4.9.2', 'matplotlib>=3.6.3', @@ -105,7 +105,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'PyQt5>=5.15.9', 'pyreadstat>=1.2.0', 'pytest>=7.3.2', - 'pytest-xdist>=2.2.0', + 'pytest-xdist>=3.4.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'qtpy>=2.3.0', @@ -148,7 +148,7 @@ setup = ['--vsenv'] # For Windows skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} -test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" +test-requires = "hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ @@ -179,7 +179,7 @@ environment = {CFLAGS="-g0"} [[tool.cibuildwheel.overrides]] select = "*pyodide*" -test-requires = "pytest>=7.3.2 hypothesis>=6.46.1" +test-requires = "pytest>=7.3.2 hypothesis>=6.84.0" # Pyodide repairs wheels on its own, using auditwheel-emscripten repair-wheel-command = "" test-command = """ diff --git a/requirements-dev.txt b/requirements-dev.txt index f5da7f70ccdba..dbfd7c6bf7bf5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,8 +8,8 @@ meson[ninja]==1.2.1 meson-python==0.13.1 pytest>=7.3.2 pytest-cov -pytest-xdist>=2.2.0 -pytest-qt>=4.2.0 +pytest-xdist>=3.4.0 +pytest-qt>=4.4.0 pytest-localserver PyQt5>=5.15.9 coverage @@ -22,7 +22,7 @@ bottleneck>=1.3.6 fastparquet>=2023.10.0 fsspec>=2022.11.0 html5lib>=1.1 -hypothesis>=6.46.1 +hypothesis>=6.84.0 gcsfs>=2022.11.0 ipython jinja2>=3.1.2 diff --git a/scripts/tests/data/deps_expected_random.yaml 
b/scripts/tests/data/deps_expected_random.yaml index 7bb95d05afb45..d1db7989a95a4 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -12,7 +12,7 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 + - pytest-xdist>=3.4.0 - psutil - boto3 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index b832b6aa95198..0a53225a5d995 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -53,7 +53,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] +test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1'] timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] @@ -74,7 +74,7 @@ html = ['beautifulsoup4>=4.9.3', 'html5lib>=1.1', 'lxml>=4.6.3'] xml = ['lxml>=4.6.3'] plot = ['matplotlib>=3.6.1'] output_formatting = ['jinja2>=3.0.0', 'tabulate>=0.8.9'] -clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.2.0'] +clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.3.0'] compression = ['zstandard>=0.15.2'] all = ['beautifulsoup4>=5.9.3', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -98,10 +98,10 @@ all = ['beautifulsoup4>=5.9.3', 'PyQt5>=5.15.1', 'pyreadstat>=1.1.2', 'pytest>=7.3.2', - 'pytest-xdist>=2.2.0', + 'pytest-xdist>=3.4.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.8', - 'qtpy>=2.2.0', + 'qtpy>=2.3.0', 'scipy>=1.7.1', 's3fs>=2021.08.0', 'SQLAlchemy>=1.4.16', @@ -138,7 +138,7 @@ parentdir_prefix = "pandas-" [tool.cibuildwheel] skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*" build-verbosity = "3" -test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0" +test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=3.4.0" test-command = "python {project}/ci/test_wheels.py" [tool.cibuildwheel.macos] diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 49299aa078ce4..afb28dd2c08bb 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -12,7 +12,7 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 + - pytest-xdist>=3.4.0 - psutil - boto3 From c3b72aae6f23f903d590a84586b9113f7b2ac986 Mon Sep 17 00:00:00 2001 From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com> Date: Thu, 18 Jul 2024 21:55:13 +0100 Subject: [PATCH 199/272] DOC: Add docstrings to fixtures in /series/methods/test_drop_duplicates.py (#59265) * Add docstrings to fixtures in /series/methods/test_drop_duplicates.py * fixup! 
Add docstrings to fixtures in /series/methods/test_drop_duplicates.py --- pandas/tests/series/methods/test_drop_duplicates.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 31ef8ff896bcc..2dbd61530dc41 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -75,10 +75,16 @@ class TestSeriesDropDuplicates: params=["int_", "uint", "float64", "str_", "timedelta64[h]", "datetime64[D]"] ) def dtype(self, request): + """ + Fixture that provides different data types for testing. + """ return request.param @pytest.fixture def cat_series_unused_category(self, dtype, ordered): + """ + Fixture that creates a Categorical Series with some unused categories. + """ # Test case 1 cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) @@ -141,7 +147,9 @@ def test_drop_duplicates_categorical_non_bool_keepfalse( @pytest.fixture def cat_series(self, dtype, ordered): - # no unused categories, unlike cat_series_unused_category + """ + Fixture that creates a Categorical Series with no unused categories. + """ cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) From 5a6025c3efb7417f6a12007f8e67ed386de168ec Mon Sep 17 00:00:00 2001 From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com> Date: Fri, 19 Jul 2024 18:13:09 +0100 Subject: [PATCH 200/272] DOCS: Add docstrings to fixtures in /indexes/datetimelike_/test_equals.py (#59278) Add docstrings to fixtures in /indexes/datetimelike_/test_equals.py file. --- pandas/tests/indexes/datetimelike_/test_equals.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/indexes/datetimelike_/test_equals.py b/pandas/tests/indexes/datetimelike_/test_equals.py index 08134d9f3efb4..df9182b2dd1c2 100644 --- a/pandas/tests/indexes/datetimelike_/test_equals.py +++ b/pandas/tests/indexes/datetimelike_/test_equals.py @@ -52,6 +52,7 @@ def test_not_equals_misc_strs(self, index): class TestPeriodIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a PeriodIndex for use in equality tests.""" return period_range("2013-01-01", periods=5, freq="D") # TODO: de-duplicate with other test_equals2 methods @@ -91,6 +92,7 @@ def test_equals2(self, freq): class TestDatetimeIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a DatetimeIndex for use in equality tests.""" return date_range("2013-01-01", periods=5) def test_equals2(self): @@ -143,6 +145,7 @@ def test_not_equals_bday(self, freq): class TestTimedeltaIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a TimedeltaIndex for use in equality tests.""" return timedelta_range("1 day", periods=10) def test_equals2(self): From fe9ff741c4f0ed612250f2a36874406d0453f99b Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 19 Jul 2024 18:18:42 +0100 Subject: [PATCH 201/272] clean: simplify lite-rule-alias and dont-uppercase (#59240) --- pandas/_libs/tslibs/dtypes.pyx | 2 + pandas/_libs/tslibs/offsets.pyx | 91 +++++++++----------- pandas/tests/arrays/test_datetimes.py | 2 +- pandas/tests/tseries/offsets/test_offsets.py | 4 +- 4 files changed, 44 insertions(+), 55 deletions(-) diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 7e6e382c17cc6..4100f3d90e817 100644 --- a/pandas/_libs/tslibs/dtypes.pyx 
+++ b/pandas/_libs/tslibs/dtypes.pyx @@ -359,6 +359,8 @@ cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { "b": "B", "c": "C", "MIN": "min", + "US": "us", + "NS": "ns", } cdef str INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index db35cc0c93237..ff24c2942cb76 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4697,13 +4697,9 @@ _lite_rule_alias = { "BYS": "BYS-JAN", # BYearBegin(month=1), "Min": "min", - "min": "min", - "ms": "ms", - "us": "us", - "ns": "ns", } -_dont_uppercase = {"h", "bh", "cbh", "MS", "ms", "s"} +_dont_uppercase = {"min", "h", "bh", "cbh", "s", "ms", "us", "ns"} INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4713,6 +4709,37 @@ INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" _offset_map = {} +def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: + if name in _lite_rule_alias: + return name + if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f" instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return c_PERIOD_AND_OFFSET_DEPR_FREQSTR[name] + + for _name in (name.lower(), name.upper()): + if name == _name: + continue + if _name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR.values(): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{_name}\' " + f" instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return _name + + return name + + def _validate_to_offset_alias(alias: str, is_period: bool) -> None: if not is_period: if alias.upper() in c_OFFSET_RENAMED_FREQSTR: @@ -4750,35 +4777,6 @@ def _get_offset(name: str) -> BaseOffset: -------- _get_offset('EOM') --> BMonthEnd(1) """ - if ( - name not in _lite_rule_alias - and (name.upper() in _lite_rule_alias) - and name != "ms" - ): - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use \'{name.upper()}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - elif ( - name not in _lite_rule_alias - and (name.lower() in _lite_rule_alias) - and name != "MS" - ): - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use \'{name.lower()}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if name not in _dont_uppercase: - name = name.upper() - name = _lite_rule_alias.get(name, name) - name = _lite_rule_alias.get(name.lower(), name) - else: - name = _lite_rule_alias.get(name, name) - if name not in _offset_map: try: split = name.split("-") @@ -4880,6 +4878,7 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): + name = _warn_about_deprecated_aliases(name, is_period) _validate_to_offset_alias(name, is_period) if is_period: if name.upper() in c_PERIOD_TO_OFFSET_FREQSTR: @@ -4888,31 +4887,21 @@ cpdef to_offset(freq, bint is_period=False): f"\'{name}\' is no longer supported, " f"please use \'{name.upper()}\' instead.", ) - name = c_PERIOD_TO_OFFSET_FREQSTR.get(name.upper()) - - if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " - f"instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = 
c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name) + name = c_PERIOD_TO_OFFSET_FREQSTR[name.upper()] + name = _lite_rule_alias.get(name, name) + if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") - prefix = _lite_rule_alias.get(name) or name if stride_sign is None: stride_sign = -1 if stride.startswith("-") else 1 if not stride: stride = 1 - if prefix in {"D", "h", "min", "s", "ms", "us", "ns"}: + if name in {"D", "h", "min", "s", "ms", "us", "ns"}: # For these prefixes, we have something like "3h" or # "2.5min", so we can construct a Timedelta with the # matching unit and get our offset from delta_to_tick - td = Timedelta(1, unit=prefix) + td = Timedelta(1, unit=name) off = delta_to_tick(td) offset = off * float(stride) if n != 0: @@ -4921,7 +4910,7 @@ cpdef to_offset(freq, bint is_period=False): offset *= stride_sign else: stride = int(stride) - offset = _get_offset(prefix) + offset = _get_offset(name) offset = offset * int(np.fabs(stride) * stride_sign) if result is None: @@ -4931,7 +4920,7 @@ cpdef to_offset(freq, bint is_period=False): except (ValueError, TypeError) as err: raise ValueError(INVALID_FREQ_ERR_MSG.format( f"{freq}, failed to parse with error message: {repr(err)}") - ) + ) from err else: result = None diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index de189b7e2f724..8e348805de978 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -760,7 +760,7 @@ def test_date_range_frequency_M_Q_Y_raises(self, freq): with pytest.raises(ValueError, match=msg): pd.date_range("1/1/2000", periods=4, freq=freq) - @pytest.mark.parametrize("freq_depr", ["2MIN", "2mS", "2Us"]) + @pytest.mark.parametrize("freq_depr", ["2MIN", "2nS", "2Us"]) def test_date_range_uppercase_frequency_deprecated(self, freq_depr): # GH#9586, GH#54939 depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 1e5bfa6033216..d19717e87c7d2 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -788,9 +788,7 @@ def test_get_offset(): pairs = [ ("B", BDay()), - ("b", BDay()), - ("bme", BMonthEnd()), - ("Bme", BMonthEnd()), + ("BME", BMonthEnd()), ("W-MON", Week(weekday=0)), ("W-TUE", Week(weekday=1)), ("W-WED", Week(weekday=2)), From abcf477a4b95e2f255580fe10c11ed1f24e5dc66 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Fri, 19 Jul 2024 15:10:58 -0400 Subject: [PATCH 202/272] TST: Added test for non-nano DTI intersection and updated whatsnew (#59280) * TST: add intersection test non-nano * modify whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/tests/indexes/datetimes/test_setops.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c5c886912eae0..e5917c9176c54 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -507,7 +507,7 @@ Datetimelike - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) -- Bug in 
:meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`) +- Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index f04f1592ea0c1..7ef6efad0ff6f 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -681,3 +681,16 @@ def test_union_non_nano_rangelike(): freq="D", ) tm.assert_index_equal(result, expected) + + +def test_intersection_non_nano_rangelike(): + # GH 59271 + l1 = date_range("2024-01-01", "2024-01-03", unit="s") + l2 = date_range("2024-01-02", "2024-01-04", unit="s") + result = l1.intersection(l2) + expected = DatetimeIndex( + ["2024-01-02", "2024-01-03"], + dtype="datetime64[s]", + freq="D", + ) + tm.assert_index_equal(result, expected) From 18a3eec55523513c5e08fe014646c044cc825fa4 Mon Sep 17 00:00:00 2001 From: Jay Ahn Date: Fri, 19 Jul 2024 18:53:49 -0400 Subject: [PATCH 203/272] DOC: DataFrame.groupy.agg with a list of tuples (#59282) Add doc for groupby.agg with a list of tuples Co-authored-by: hye ryung cho --- doc/source/user_guide/groupby.rst | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 267499edfae6f..8c80fa7052dd5 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -668,8 +668,9 @@ column, which produces an aggregated result with a hierarchical column index: grouped[["C", "D"]].agg(["sum", "mean", "std"]) -The resulting aggregations are named after the functions themselves. If you -need to rename, then you can add in a chained operation for a ``Series`` like this: +The resulting aggregations are named after the functions themselves. + +For a ``Series``, if you need to rename, you can add in a chained operation like this: .. ipython:: python @@ -679,8 +680,19 @@ need to rename, then you can add in a chained operation for a ``Series`` like th .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"}) ) +Or, you can simply pass a list of tuples each with the name of the new column and the aggregate function: + +.. ipython:: python + + ( + grouped["C"] + .agg([("foo", "sum"), ("bar", "mean"), ("baz", "std")]) + ) + For a grouped ``DataFrame``, you can rename in a similar manner: +By chaining ``rename`` operation, + .. ipython:: python ( @@ -689,6 +701,16 @@ For a grouped ``DataFrame``, you can rename in a similar manner: ) ) +Or, passing a list of tuples, + +.. ipython:: python + + ( + grouped[["C", "D"]].agg( + [("foo", "sum"), ("bar", "mean"), ("baz", "std")] + ) + ) + .. 
note:: In general, the output column names should be unique, but pandas will allow From 080add14720218a20ca1dbde5e59d82d05534c5a Mon Sep 17 00:00:00 2001 From: AndreyKolomiets Date: Sun, 21 Jul 2024 17:36:14 +0300 Subject: [PATCH 204/272] TYP: Typing improvements for Index (#59105) * Typing improvements for Index * better numpy type hints for Index.delete * replace some hints with literals, move slice_type to _typing.py --- pandas/_typing.py | 2 ++ pandas/core/indexes/base.py | 41 +++++++++++++++++++++++++++---------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 09a3f58d6ab7f..d43e6e900546d 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -526,3 +526,5 @@ def closed(self) -> bool: # maintaine the sub-type of any hashable sequence SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable]) + +SliceType = Optional[Hashable] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5bffac5fa64b6..b187e578a252b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -45,7 +45,9 @@ ArrayLike, Axes, Axis, + AxisInt, DropKeep, + Dtype, DtypeObj, F, IgnoreRaise, @@ -57,6 +59,7 @@ ReindexMethod, Self, Shape, + SliceType, npt, ) from pandas.compat.numpy import function as nv @@ -1087,7 +1090,7 @@ def view(self, cls=None): result._id = self._id return result - def astype(self, dtype, copy: bool = True): + def astype(self, dtype: Dtype, copy: bool = True): """ Create an Index with values cast to dtypes. @@ -2957,7 +2960,7 @@ def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index return self, other @final - def union(self, other, sort=None): + def union(self, other, sort: bool | None = None): """ Form the union of two Index objects. @@ -3334,7 +3337,7 @@ def _intersection_via_get_indexer( return result @final - def difference(self, other, sort=None): + def difference(self, other, sort: bool | None = None): """ Return a new Index with elements of index not in `other`. @@ -3420,7 +3423,12 @@ def _wrap_difference_result(self, other, result): # We will override for MultiIndex to handle empty results return self._wrap_setop_result(other, result) - def symmetric_difference(self, other, result_name=None, sort=None): + def symmetric_difference( + self, + other, + result_name: abc.Hashable | None = None, + sort: bool | None = None, + ): """ Compute the symmetric difference of two Index objects. @@ -6389,7 +6397,7 @@ def _transform_index(self, func, *, level=None) -> Index: items = [func(x) for x in self] return Index(items, name=self.name, tupleize_cols=False) - def isin(self, values, level=None) -> npt.NDArray[np.bool_]: + def isin(self, values, level: str_t | int | None = None) -> npt.NDArray[np.bool_]: """ Return a boolean array where the index values are in `values`. @@ -6687,7 +6695,12 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: else: return slc - def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: + def slice_locs( + self, + start: SliceType = None, + end: SliceType = None, + step: int | None = None, + ) -> tuple[int, int]: """ Compute slice locations for input labels. @@ -6781,7 +6794,9 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: return start_slice, end_slice - def delete(self, loc) -> Self: + def delete( + self, loc: int | np.integer | list[int] | npt.NDArray[np.integer] + ) -> Self: """ Make new Index with passed location(-s) deleted. 
@@ -7227,7 +7242,9 @@ def _maybe_disable_logical_methods(self, opname: str_t) -> None: raise TypeError(f"cannot perform {opname} with {type(self).__name__}") @Appender(IndexOpsMixin.argmin.__doc__) - def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def argmin( + self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs + ) -> int: nv.validate_argmin(args, kwargs) nv.validate_minmax_axis(axis) @@ -7240,7 +7257,9 @@ def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: return super().argmin(skipna=skipna) @Appender(IndexOpsMixin.argmax.__doc__) - def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def argmax( + self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs + ) -> int: nv.validate_argmax(args, kwargs) nv.validate_minmax_axis(axis) @@ -7251,7 +7270,7 @@ def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: raise ValueError("Encountered all NA values") return super().argmax(skipna=skipna) - def min(self, axis=None, skipna: bool = True, *args, **kwargs): + def min(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs): """ Return the minimum value of the Index. @@ -7314,7 +7333,7 @@ def min(self, axis=None, skipna: bool = True, *args, **kwargs): return nanops.nanmin(self._values, skipna=skipna) - def max(self, axis=None, skipna: bool = True, *args, **kwargs): + def max(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs): """ Return the maximum value of the Index. From 559a187092fba9b4976a983d5e3f115467acf91b Mon Sep 17 00:00:00 2001 From: jayendhargautham Date: Mon, 22 Jul 2024 13:27:28 -0400 Subject: [PATCH 205/272] DOC: Fix sentence fragment in string methods (#59291) fixed documentation --- doc/source/user_guide/text.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index ad2690ae395be..827e7a3c884d9 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -204,7 +204,7 @@ and replacing any remaining whitespaces with underscores: .. warning:: - The type of the Series is inferred and the allowed types (i.e. strings). + The type of the Series is inferred and is one among the allowed types (i.e. strings). Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few exceptions, other uses are not supported, and may be disabled at a later point. From e01870390f19dab34d8fea945b5a01a8b4c6d1af Mon Sep 17 00:00:00 2001 From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:28:14 +0100 Subject: [PATCH 206/272] DOCS: Add docstrings to fixture in /indexing/interval/test_interval_new.py (#59290) Add docstrings to fixture in /indexing/interval/test_interval_new.py file. --- pandas/tests/indexing/interval/test_interval_new.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 283921a23e368..4c1efe9e4f81d 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -17,6 +17,9 @@ class TestIntervalIndex: @pytest.fixture def series_with_interval_index(self): + """ + Fixture providing a Series with an IntervalIndex. 
+ """ return Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) def test_loc_with_interval(self, series_with_interval_index, indexer_sl): From dd464e291228009d1d2ad34cff1acab469fdb341 Mon Sep 17 00:00:00 2001 From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:28:41 +0100 Subject: [PATCH 207/272] DOCS: Add docstrings to fixture in /indexes/categorical/test_category.py (#59289) Add docstrings to fixture in /indexes/categorical/test_category.py file. --- pandas/tests/indexes/categorical/test_category.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 03a298a13dc2b..4a65d65c03d91 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -21,6 +21,9 @@ class TestCategoricalIndex: @pytest.fixture def simple_index(self) -> CategoricalIndex: + """ + Fixture that provides a CategoricalIndex. + """ return CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) def test_can_hold_identifiers(self): From dd32055297d6a2af1227f61b0efc3e680da3fb53 Mon Sep 17 00:00:00 2001 From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:29:09 +0100 Subject: [PATCH 208/272] DOCS: Add docstrings to fixtures in /indexes/interval/test_constructors.py (#59288) Add docstrings to fixtures in /indexes/interval/test_constructors.py file. --- pandas/tests/indexes/interval/test_constructors.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 4a9eb4dd9fc0c..8db483751438c 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -210,6 +210,7 @@ class TestFromArrays(ConstructorTests): @pytest.fixture def constructor(self): + """Fixture for IntervalIndex.from_arrays constructor""" return IntervalIndex.from_arrays def get_kwargs_from_breaks(self, breaks, closed="right"): @@ -282,6 +283,7 @@ class TestFromBreaks(ConstructorTests): @pytest.fixture def constructor(self): + """Fixture for IntervalIndex.from_breaks constructor""" return IntervalIndex.from_breaks def get_kwargs_from_breaks(self, breaks, closed="right"): @@ -320,6 +322,7 @@ class TestFromTuples(ConstructorTests): @pytest.fixture def constructor(self): + """Fixture for IntervalIndex.from_tuples constructor""" return IntervalIndex.from_tuples def get_kwargs_from_breaks(self, breaks, closed="right"): @@ -370,6 +373,7 @@ class TestClassConstructors(ConstructorTests): @pytest.fixture def constructor(self): + """Fixture for IntervalIndex class constructor""" return IntervalIndex def get_kwargs_from_breaks(self, breaks, closed="right"): From a9b99f73cb2108209c7a6bd1d2a52ac1ddc14243 Mon Sep 17 00:00:00 2001 From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:29:38 +0100 Subject: [PATCH 209/272] DOCS: Add docstrings to fixtures in /io/json/test_json_table_schema_ext_dtype.py (#59287) Add docstrings to fixtures in /io/json/test_json_table_schema_ext_dtype.py file. 
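The motivation behind this batch of docstring-only patches is that pytest surfaces fixture docstrings: running `pytest --fixtures` prints the first docstring line next to each fixture name, and undocumented fixtures show up blank. A minimal hypothetical fixture (not taken from any of these patches) illustrating the convention:

```python
import pandas as pd
import pytest


@pytest.fixture
def sample_frame():
    """Fixture providing a small two-column DataFrame."""
    return pd.DataFrame({"a": [1, 2], "b": [3, 4]})


def test_column_selection(sample_frame):
    # `pytest --fixtures` displays the docstring above for this fixture
    assert list(sample_frame["a"]) == [1, 2]
```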
--- pandas/tests/io/json/test_json_table_schema_ext_dtype.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index 68c7a96920533..8de289afe9ff9 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -97,18 +97,22 @@ def test_as_json_table_type_ext_integer_dtype(self): class TestTableOrient: @pytest.fixture def da(self): + """Fixture for creating a DateArray.""" return DateArray([dt.date(2021, 10, 10)]) @pytest.fixture def dc(self): + """Fixture for creating a DecimalArray.""" return DecimalArray([decimal.Decimal(10)]) @pytest.fixture def sa(self): + """Fixture for creating a StringDtype array.""" return array(["pandas"], dtype="string") @pytest.fixture def ia(self): + """Fixture for creating an Int64Dtype array.""" return array([10], dtype="Int64") def test_build_date_series(self, da): From 8f737b331f18aa35f34b61644d3e86615b6d0c61 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Jul 2024 07:41:42 -1000 Subject: [PATCH 210/272] CLN: Remove __nonzero__ in favor of __bool__ (#59275) --- pandas/core/generic.py | 4 +--- pandas/core/indexes/base.py | 4 +--- pandas/core/internals/managers.py | 5 +---- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5913532e28ec2..8a6fc69d47cc3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1491,14 +1491,12 @@ def __invert__(self) -> Self: return res.__finalize__(self, method="__invert__") @final - def __nonzero__(self) -> NoReturn: + def __bool__(self) -> NoReturn: raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) - __bool__ = __nonzero__ - @final def abs(self) -> Self: """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b187e578a252b..541ed7dce3aac 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2910,14 +2910,12 @@ def __iadd__(self, other): return self + other @final - def __nonzero__(self) -> NoReturn: + def __bool__(self) -> NoReturn: raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) - __bool__ = __nonzero__ - # -------------------------------------------------------------------- # Set Operation Methods diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b47d5fe18b9c9..c42ea44b2fc89 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -263,12 +263,9 @@ def make_empty(self, axes=None) -> Self: blocks = [] return type(self).from_blocks(blocks, axes) - def __nonzero__(self) -> bool: + def __bool__(self) -> bool: return True - # Python3 compat - __bool__ = __nonzero__ - def set_axis(self, axis: AxisInt, new_labels: Index) -> None: # Caller is responsible for ensuring we have an Index object. 
self._validate_set_axis(axis, new_labels) From 04424b0382ab4c03bf0243995e6b62b0d7d1cea8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Jul 2024 07:51:15 -1000 Subject: [PATCH 211/272] PERF: Don't create a CategoricalIndex._engine in __contains__ if categories are RangeIndex (#59178) * PERF: Don't create a CategoricalIndex._engine in __contains__ if categories are RangeIndex * Fix typing --- pandas/core/indexes/category.py | 9 +++++++-- pandas/tests/indexes/categorical/test_category.py | 7 +++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 3b04d95cb7cbd..312219eb7b91a 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -377,8 +377,13 @@ def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. if is_valid_na_for_dtype(key, self.categories.dtype): return self.hasnans - - return contains(self, key, container=self._engine) + if self.categories._typ == "rangeindex": + container: Index | libindex.IndexEngine | libindex.ExtensionEngine = ( + self.categories + ) + else: + container = self._engine + return contains(self, key, container=container) def reindex( self, target, method=None, level=None, limit: int | None = None, tolerance=None diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 4a65d65c03d91..87ec8289089dc 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -395,3 +395,10 @@ def test_remove_maintains_order(self): ["a", "b", np.nan, "d", "d", "a"], categories=list("dba"), ordered=True ), ) + + +def test_contains_rangeindex_categories_no_engine(): + ci = CategoricalIndex(range(3)) + assert 2 in ci + assert 5 not in ci + assert "_engine" not in ci._cache From efb99be258d023e325f657270e2a869f3cdf5927 Mon Sep 17 00:00:00 2001 From: Chaarvi Bansal <49508554+chaarvii@users.noreply.github.com> Date: Mon, 22 Jul 2024 21:06:16 +0100 Subject: [PATCH 212/272] DOC: Using np.histogram_bin_edges with pd.cut (#59281) * update documentation * Update pandas/core/reshape/tile.py Co-authored-by: Asish Mahapatra * Update pandas/core/reshape/tile.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Asish Mahapatra Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/reshape/tile.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 0052bcfe09147..18517199f073c 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -142,12 +142,17 @@ def cut( fixed set of values. Series : One-dimensional array with axis labels (including time series). IntervalIndex : Immutable Index implementing an ordered, sliceable set. + numpy.histogram_bin_edges: Function to calculate only the edges of the bins + used by the histogram function. Notes ----- Any NA values will be NA in the result. Out of bounds values will be NA in the resulting Series or Categorical object. + ``numpy.histogram_bin_edges`` can be used along with cut to calculate bins according + to some predefined methods. + Reference :ref:`the user guide ` for more examples. 
Examples @@ -239,6 +244,16 @@ def cut( >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]] + + Using np.histogram_bin_edges with cut + + >>> pd.cut( + ... np.array([1, 7, 5, 4]), + ... bins=np.histogram_bin_edges(np.array([1, 7, 5, 4]), bins="auto"), + ... ) + ... # doctest: +ELLIPSIS + [NaN, (5.0, 7.0], (3.0, 5.0], (3.0, 5.0]] + Categories (3, interval[float64, right]): [(1.0, 3.0] < (3.0, 5.0] < (5.0, 7.0]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 From 7c0ee27e6c00e9645154583917de0f385190d8d8 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 22 Jul 2024 22:26:45 +0200 Subject: [PATCH 213/272] Upload 3.13 & free-threaded nightly wheels (#59136) * Upload free-threaded nightly wheels on Linux and macOS * Consolidate jobs into one * Install build dependencies in before-build and pass --no-build-isolation * Fix {project} placeholder in cibuildwheel config * Correctly quote echo CIBW_BUILD_FRONTEND command * Run echo -e * Add {package} to before-build * Include cibw script in sdist & add matrix value for build frontend * Change manifest and gitattributes * Change gitattributes * Install verioneer in before-build * Add cibw_before_test to install nightly NumPy * Expand before-test to musl * Better comments plus always run before-build/before-test on 3.13 * Add --no-build-isolation in 3.13 as well * Install nightly numpy before windows tests * Address feedback; add todo for NumPy nightly and move default outside matrix * Set build_frontend to 'build' in pyodide build --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .gitattributes | 6 +++++- .github/workflows/wheels.yml | 28 +++++++++++++++++++++------- MANIFEST.in | 4 ++++ pyproject.toml | 7 +++++-- scripts/cibw_before_build.sh | 9 +++++++++ scripts/cibw_before_test.sh | 8 ++++++++ 6 files changed, 52 insertions(+), 10 deletions(-) create mode 100644 scripts/cibw_before_build.sh create mode 100644 scripts/cibw_before_test.sh diff --git a/.gitattributes b/.gitattributes index 19c6fd2fd1d47..b3d70ca8b24fb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -68,7 +68,7 @@ ci export-ignore doc export-ignore gitpod export-ignore MANIFEST.in export-ignore -scripts export-ignore +scripts/** export-ignore typings export-ignore web export-ignore CITATION.cff export-ignore @@ -82,3 +82,7 @@ setup.py export-ignore # csv_dir_path fixture checks the existence of the directory # exclude the whole directory to avoid running related tests in sdist pandas/tests/io/parser/data export-ignore + +# Include cibw script in sdist since it's needed for building wheels +scripts/cibw_before_build.sh -export-ignore +scripts/cibw_before_test.sh -export-ignore diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6405156f09833..9f07648b254dd 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -99,14 +99,25 @@ jobs: - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? 
- python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] - + python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] + include: + # TODO: Remove this plus installing build deps in cibw_before_build.sh + # and test deps in cibw_before_test.sh after pandas can be built with a released NumPy/Cython + - python: ["cp313", "3.13"] + cibw_build_frontend: 'pip; args: --no-build-isolation' + - python: ["cp313t", "3.13"] + cibw_build_frontend: 'pip; args: --no-build-isolation' # Build Pyodide wheels and upload them to Anaconda.org # NOTE: this job is similar to the one in unit-tests.yml except for the fact # that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup. - include: - - buildplat: [ubuntu-22.04, pyodide_wasm32] - python: ["cp312", "3.12"] + - buildplat: [ubuntu-22.04, pyodide_wasm32] + python: ["cp312", "3.12"] + cibw_build_frontend: 'build' + # TODO: Build free-threaded wheels for Windows + exclude: + - buildplat: [windows-2022, win_amd64] + python: ["cp313t", "3.13"] + env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -153,6 +164,7 @@ jobs: env: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }} CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} - name: Set up Python @@ -176,15 +188,17 @@ jobs: - name: Test Windows Wheels if: ${{ matrix.buildplat[1] == 'win_amd64' }} shell: pwsh + # TODO: Remove NumPy nightly install when there's a 3.13 wheel on PyPI run: | $TST_CMD = @" python -m pip install hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0; + ${{ matrix.python[1] == '3.13' && 'python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy;' }} python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ # add rc to the end of the image name if the Python version is unreleased - docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD + docker pull python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v4 with: diff --git a/MANIFEST.in b/MANIFEST.in index 9894381ed6252..f586d457eaaf8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -62,3 +62,7 @@ prune pandas/tests/io/parser/data # Selectively re-add *.cxx files that were excluded above graft pandas/_libs/src graft pandas/_libs/include + +# Include cibw script in sdist since it's needed for building wheels +include scripts/cibw_before_build.sh +include scripts/cibw_before_test.sh diff --git a/pyproject.toml b/pyproject.toml index 7d3b55ce63ee4..cc5cc1cf84d0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -154,14 +154,17 @@ test-command = """ 
   pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \
   pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \
 """
+free-threaded-support = true
+before-build = "bash {package}/scripts/cibw_before_build.sh"
+before-test = "bash {package}/scripts/cibw_before_test.sh"

 [tool.cibuildwheel.windows]
-before-build = "pip install delvewheel"
+before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh"
 repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}"

 [[tool.cibuildwheel.overrides]]
 select = "*-musllinux*"
-before-test = "apk update && apk add musl-locales && bash {package}/scripts/cibw_before_test.sh"

 [[tool.cibuildwheel.overrides]]
 select = "*-win*"
diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh
new file mode 100644
index 0000000000000..f3049b27ed5d1
--- /dev/null
+++ b/scripts/cibw_before_build.sh
@@ -0,0 +1,9 @@
+# TODO: Delete when there are PyPI NumPy/Cython releases that support Python 3.13.
+# If free-threading support is not included in those releases, this script will have
+# to check whether this runs for a free-threaded build instead.
+PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")"
+if [[ $PYTHON_VERSION == "313" ]]; then
+    python -m pip install -U pip
+    python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython
+    python -m pip install ninja meson-python versioneer[toml]
+fi
diff --git a/scripts/cibw_before_test.sh b/scripts/cibw_before_test.sh
new file mode 100644
index 0000000000000..7d1b143881ced
--- /dev/null
+++ b/scripts/cibw_before_test.sh
@@ -0,0 +1,8 @@
+# TODO: Delete when there are PyPI NumPy/Cython releases that support Python 3.13.
+# If free-threading support is not included in those releases, this script will have
+# to check whether this runs for a free-threaded build instead.
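For reference, the gate both scripts apply is a plain version-string comparison. The same check expressed in Python (a sketch, not used by the scripts themselves; the free-threading probe via `sysconfig.get_config_var("Py_GIL_DISABLED")` is the CPython 3.13 build flag the TODO comments allude to):

```python
import sys
import sysconfig

# Same value the shell scripts compute: e.g. "313" on Python 3.13
python_version = f"{sys.version_info.major}{sys.version_info.minor}"
needs_nightly_deps = python_version == "313"

# On CPython 3.13+, a free-threaded (no-GIL) build sets this config var to 1
is_free_threaded = bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
```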
+PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" +if [[ $PYTHON_VERSION == "313" ]]; then + python -m pip install -U pip + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy +fi From b8bc510bcf731da3ff24f0454103916f450b50c2 Mon Sep 17 00:00:00 2001 From: matiaslindgren Date: Tue, 23 Jul 2024 00:47:11 +0200 Subject: [PATCH 214/272] BUG: Hash and compare tuple subclasses as builtin tuples (#59286) * cast all tuple subclass index keys to tuple * fix docs typo * add multi-index namedtuple test * hash and compare all tuple subclasses as tuples * test hashtable with namedtuples * remove redundant index key conversion * add comments * update whatsnew * check key error message * fix whatsnew section * test namedtuple and tuple interchangeable in hashtable * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * use pytest.raises regexp instead of str eq --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 + .../pandas/vendored/klib/khash_python.h | 6 ++- pandas/tests/indexes/multi/test_indexing.py | 24 ++++++++++ pandas/tests/libs/test_hashtable.py | 46 ++++++++++++++++--- 4 files changed, 70 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e5917c9176c54..d940d564b8df2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -33,6 +33,7 @@ Other enhancements - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). +- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) @@ -231,6 +232,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) - :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) +- All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` during indexing operations (:issue:`57922`) - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. 
Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`)
diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
index 8d4c382241d39..2fa61642968cf 100644
--- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h
+++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
@@ -207,7 +207,8 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) {
   if (PyComplex_CheckExact(a)) {
     return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b);
   }
-  if (PyTuple_CheckExact(a)) {
+  if (PyTuple_Check(a)) {
+    // compare tuple subclasses as builtin tuples
     return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b);
   }
   // frozenset isn't yet supported
@@ -311,7 +312,8 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) {
     // because complex(k,0) == k holds for any int-object k
     // and kh_complex128_hash_func doesn't respect it
     hash = complexobject_hash((PyComplexObject *)key);
-  } else if (PyTuple_CheckExact(key)) {
+  } else if (PyTuple_Check(key)) {
+    // hash tuple subclasses as builtin tuples
    hash = tupleobject_hash((PyTupleObject *)key);
   } else {
     hash = PyObject_Hash(key);
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
index f08a7625e7f8a..d82203a53a60f 100644
--- a/pandas/tests/indexes/multi/test_indexing.py
+++ b/pandas/tests/indexes/multi/test_indexing.py
@@ -1,3 +1,4 @@
+from collections import namedtuple
 from datetime import timedelta
 import re

@@ -1006,3 +1007,26 @@ def test_get_indexer_for_multiindex_with_nans(nulls_fixture):
     result = idx1.get_indexer(idx2)
     expected = np.array([-1, 1], dtype=np.intp)
     tm.assert_numpy_array_equal(result, expected)
+
+
+def test_get_loc_namedtuple_behaves_like_tuple():
+    # GH57922
+    NamedIndex = namedtuple("NamedIndex", ("a", "b"))
+    multi_idx = MultiIndex.from_tuples(
+        [NamedIndex("i1", "i2"), NamedIndex("i3", "i4"), NamedIndex("i5", "i6")]
+    )
+    for idx in (multi_idx, multi_idx.to_flat_index()):
+        assert idx.get_loc(NamedIndex("i1", "i2")) == 0
+        assert idx.get_loc(NamedIndex("i3", "i4")) == 1
+        assert idx.get_loc(NamedIndex("i5", "i6")) == 2
+        assert idx.get_loc(("i1", "i2")) == 0
+        assert idx.get_loc(("i3", "i4")) == 1
+        assert idx.get_loc(("i5", "i6")) == 2
+    multi_idx = MultiIndex.from_tuples([("i1", "i2"), ("i3", "i4"), ("i5", "i6")])
+    for idx in (multi_idx, multi_idx.to_flat_index()):
+        assert idx.get_loc(NamedIndex("i1", "i2")) == 0
+        assert idx.get_loc(NamedIndex("i3", "i4")) == 1
+        assert idx.get_loc(NamedIndex("i5", "i6")) == 2
+        assert idx.get_loc(("i1", "i2")) == 0
+        assert idx.get_loc(("i3", "i4")) == 1
+        assert idx.get_loc(("i5", "i6")) == 2
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index b70386191d9d9..50b561aefcf49 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -1,3 +1,4 @@
+from collections import namedtuple
 from collections.abc import Generator
 from contextlib import contextmanager
 import re
@@ -405,9 +406,8 @@ def test_nan_complex_real(self):
         table = ht.PyObjectHashTable()
         table.set_item(nan1, 42)
         assert table.get_item(nan2) == 42
-        with pytest.raises(KeyError, match=None) as error:
+        with pytest.raises(KeyError, match=re.escape(repr(other))):
             table.get_item(other)
-        assert str(error.value) == str(other)

     def test_nan_complex_imag(self):
         nan1 = complex(1, float("nan"))
@@ -417,9 +417,8 @@ def test_nan_complex_imag(self):
         table = ht.PyObjectHashTable()
         table.set_item(nan1, 42)
         assert table.get_item(nan2) == 42
-        with pytest.raises(KeyError, match=None) as error:
+        with pytest.raises(KeyError, match=re.escape(repr(other))):
             table.get_item(other)
-        assert str(error.value) == str(other)

     def test_nan_in_tuple(self):
         nan1 = (float("nan"),)
@@ -436,9 +435,28 @@ def test_nan_in_nested_tuple(self):
         table = ht.PyObjectHashTable()
         table.set_item(nan1, 42)
         assert table.get_item(nan2) == 42
-        with pytest.raises(KeyError, match=None) as error:
+        with pytest.raises(KeyError, match=re.escape(repr(other))):
+            table.get_item(other)
+
+    def test_nan_in_namedtuple(self):
+        T = namedtuple("T", ["x"])
+        nan1 = T(float("nan"))
+        nan2 = T(float("nan"))
+        assert nan1.x is not nan2.x
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+
+    def test_nan_in_nested_namedtuple(self):
+        T = namedtuple("T", ["x", "y"])
+        nan1 = T(1, (2, (float("nan"),)))
+        nan2 = T(1, (2, (float("nan"),)))
+        other = T(1, 2)
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+        with pytest.raises(KeyError, match=re.escape(repr(other))):
             table.get_item(other)
-        assert str(error.value) == str(other)


 def test_hash_equal_tuple_with_nans():
@@ -448,6 +466,22 @@ def test_hash_equal_tuple_with_nans():
     assert ht.objects_are_equal(a, b)


+def test_hash_equal_namedtuple_with_nans():
+    T = namedtuple("T", ["x", "y"])
+    a = T(float("nan"), (float("nan"), float("nan")))
+    b = T(float("nan"), (float("nan"), float("nan")))
+    assert ht.object_hash(a) == ht.object_hash(b)
+    assert ht.objects_are_equal(a, b)
+
+
+def test_hash_equal_namedtuple_and_tuple():
+    T = namedtuple("T", ["x", "y"])
+    a = T(1, (2, 3))
+    b = (1, (2, 3))
+    assert ht.object_hash(a) == ht.object_hash(b)
+    assert ht.objects_are_equal(a, b)
+
+
 def test_get_labels_groupby_for_Int64(writable):
     table = ht.Int64HashTable()
     vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
From 67a58cddc2f8c0e30cb0123589947d9b3f073720 Mon Sep 17 00:00:00 2001
From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com>
Date: Tue, 23 Jul 2024 01:12:41 +0100
Subject: [PATCH 215/272] ENH: Add support for reading 102-format Stata dta
 files (#58978)

* ENH: Add support for reading 102-format Stata dta files

* Add reference to pull request in whatsnew file

* Update doc/source/whatsnew/v3.0.0.rst

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

* Remove extra space

* Use datapath() for specifying test file locations

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 doc/source/whatsnew/v3.0.0.rst                |   1 +
 pandas/io/stata.py                            |  27 ++++++++++++++----
 .../tests/io/data/stata/stata-compat-102.dta  | Bin 0 -> 558 bytes
 pandas/tests/io/data/stata/stata4_102.dta     | Bin 0 -> 778 bytes
 pandas/tests/io/test_stata.py                 |  16 ++++++++++-
 5 files changed, 38 insertions(+), 6 deletions(-)
 create mode 100644 pandas/tests/io/data/stata/stata-compat-102.dta
 create mode 100644 pandas/tests/io/data/stata/stata4_102.dta

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index d940d564b8df2..30dab4e2d71e3 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -50,6 +50,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - Restore support for reading Stata 104-format and
enable reading 103-format dta files (:issue:`58554`) +- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5146876d20374..dd92b1bbfdba0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -91,9 +91,9 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " - "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," - "and 119 (Stata 15/16, over 32,767 variables)." + "versions 102, 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), " + "113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), " + "118 (Stata 14/15/16), and 119 (Stata 15/16, over 32,767 variables)." ) _statafile_processing_params1 = """\ @@ -1352,8 +1352,10 @@ def _get_variable_labels(self) -> list[str]: def _get_nobs(self) -> int: if self._format_version >= 118: return self._read_uint64() - else: + elif self._format_version >= 103: return self._read_uint32() + else: + return self._read_uint16() def _get_data_label(self) -> str: if self._format_version >= 118: @@ -1393,9 +1395,24 @@ def _get_seek_variable_labels(self) -> int: def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]: + if self._format_version not in [ + 102, + 103, + 104, + 105, + 108, + 110, + 111, + 113, + 114, + 115, + ]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() + # Note 102 format will have a zero in this header position, so support + # relies on little-endian being set whenever this value isn't one, + # even though for later releases strictly speaking the value should + # be either one or two to be valid self._byteorder = ">" if self._read_int8() == 0x1 else "<" self._filetype = self._read_int8() self._path_or_buf.read(1) # unused diff --git a/pandas/tests/io/data/stata/stata-compat-102.dta b/pandas/tests/io/data/stata/stata-compat-102.dta new file mode 100644 index 0000000000000000000000000000000000000000..424b767b0011c543ebd55ef7ccd632b45481cd4b GIT binary patch literal 558 zcmYdeU}RutU}hi$axyb>(o#|~^HNePKx#8BpgJ-Q%^<8yV_kTggE*N)q zCcpRD|4W+`)S*62=X&K5=EoXb~4XJOxyIDl z-o`RCF=^FFLFyQ5{W|rv%JhhjL~R?ak0Q}(=yaJ?hhd zhBTrv--h4~_Y%c@NVcpDGDT=ABnrEehT6VKy*4}J*)w)sMz-E2fhbvG^~+(N=Ut!i O^Ug%>l$`s&?C~2P@1jcp literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2534df6a82f89..6d6f222fc0660 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize("version", [103, 104, 105, 108]) + @pytest.mark.parametrize("version", [102, 103, 104, 105, 108]) def test_readold_dta4(self, version, datapath): # This test is the same as test_read_dta4 above except that the columns # had to be renamed to match the restrictions in older file format @@ -2058,6 +2058,19 @@ def test_backward_compat_nodateconversion(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) +@pytest.mark.parametrize("version", [102]) +def test_backward_compat_nostring(version, datapath): + # The Stata 
data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", "stata-compat-118.dta") + old = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + expected = read_stata(ref, convert_dates=False) + # The Stata data format prior to 103 did not support string data + expected = expected.drop(columns=["s10"]) + old_dta = read_stata(old, convert_dates=False) + tm.assert_frame_equal(old_dta, expected, check_dtype=False) + + @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118]) def test_bigendian(version, datapath): ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") @@ -2067,6 +2080,7 @@ def test_bigendian(version, datapath): tm.assert_frame_equal(big_dta, expected) +# Note: 102 format does not support big-endian byte order @pytest.mark.parametrize("version", [103, 104]) def test_bigendian_nodateconversion(version, datapath): # The Stata data format prior to 105 did not support a date format From 31a1df1d62fb5760dc85a5e955635d739ed22072 Mon Sep 17 00:00:00 2001 From: sliuos <175488904+sliuos@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:31:50 -0400 Subject: [PATCH 216/272] Add fixture docstring for series indexing (#59292) * Add fixture docstring for series indexing * Make fixture docstring one line only --------- Co-authored-by: Shawn Liu --- pandas/tests/series/indexing/test_setitem.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 253339f8a6446..62f2c93ef691a 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1178,25 +1178,40 @@ def test_setitem_example(self): @pytest.fixture def obj(self): + """ + Fixture to create a Series [(0, 1], (1, 2], (2, 3]] + """ idx = IntervalIndex.from_breaks(range(4)) return Series(idx) @pytest.fixture def val(self): + """ + Fixture to get an interval (0.5, 1.5] + """ return Interval(0.5, 1.5) @pytest.fixture def key(self): + """ + Fixture to get a key 0 + """ return 0 @pytest.fixture def expected(self, obj, val): + """ + Fixture to get a Series [(0.5, 1.5], (1.0, 2.0], (2.0, 3.0]] + """ data = [val] + list(obj[1:]) idx = IntervalIndex(data, dtype="Interval[float64]") return Series(idx) @pytest.fixture def raises(self): + """ + Fixture to enable raising pytest exceptions + """ return True From e191a06002176917f6f5dd90d0bb995565865654 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Tue, 23 Jul 2024 22:54:31 +0200 Subject: [PATCH 217/272] DEPR: group by one element list gets scalar keys (#59179) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/groupby/groupby.py | 11 +++++- pandas/tests/groupby/test_grouping.py | 53 ++++++++++++++++++--------- pandas/tests/groupby/test_raises.py | 3 ++ 4 files changed, 50 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 30dab4e2d71e3..5cfb36986f8ef 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -283,6 +283,7 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, 
:meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`) +- Deprecated behavior of :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups`, in a future version ``groups`` by one element list will return tuple instead of scalar. (:issue:`58858`) - Deprecated behavior of :meth:`Series.dt.to_pytimedelta`, in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta; this matches the behavior of other :meth:`Series.dt` properties. (:issue:`57463`) - Deprecated lowercase strings ``d``, ``b`` and ``c`` denoting frequencies in :class:`Day`, :class:`BusinessDay` and :class:`CustomBusinessDay` in favour of ``D``, ``B`` and ``C`` (:issue:`58998`) - Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bf71bb80b3623..945173bc48fe9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -634,7 +634,7 @@ def groups(self) -> dict[Hashable, Index]: 0 1 2 3 1 1 5 6 2 7 8 9 - >>> df.groupby(by=["a"]).groups + >>> df.groupby(by="a").groups {1: [0, 1], 7: [2]} For Resampler: @@ -654,6 +654,15 @@ def groups(self) -> dict[Hashable, Index]: >>> ser.resample("MS").groups {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} """ + if isinstance(self.keys, list) and len(self.keys) == 1: + warnings.warn( + "`groups` by one element list returns scalar is deprecated " + "and will be removed. In a future version `groups` by one element " + "list will return tuple. 
Use ``df.groupby(by='a').groups`` " + "instead of ``df.groupby(by=['a']).groups`` to avoid this warning", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._grouper.groups @final diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 39eadd32f300d..814b35ad577f1 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -545,31 +545,38 @@ def test_multiindex_columns_empty_level(self): df = DataFrame([[1, "A"]], columns=midx) + msg = "`groups` by one element list returns scalar is deprecated" grouped = df.groupby("to filter").groups assert grouped["A"] == [0] - grouped = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = df.groupby([("to filter", "")]).groups assert grouped["A"] == [0] df = DataFrame([[1, "A"], [2, "B"]], columns=midx) expected = df.groupby("to filter").groups - result = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([("to filter", "")]).groups assert result == expected df = DataFrame([[1, "A"], [2, "A"]], columns=midx) expected = df.groupby("to filter").groups - result = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([("to filter", "")]).groups tm.assert_dict_equal(result, expected) def test_groupby_multiindex_tuple(self): - # GH 17979 + # GH 17979, GH#59179 df = DataFrame( [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), ) - expected = df.groupby([("b", 1)]).groups + + msg = "`groups` by one element list returns scalar is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby([("b", 1)]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) @@ -579,17 +586,21 @@ def test_groupby_multiindex_tuple(self): [["a", "b", "b", "c"], ["d", "d", "e", "e"]] ), ) - expected = df2.groupby([("b", "d")]).groups + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df2.groupby([("b", "d")]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) df3 = DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"]) - expected = df3.groupby([("b", "d")]).groups + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df3.groupby([("b", "d")]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) def test_groupby_multiindex_partial_indexing_equivalence(self): - # GH 17977 + # GH 17977, GH#59179 df = DataFrame( [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), @@ -615,8 +626,10 @@ def test_groupby_multiindex_partial_indexing_equivalence(self): result_max = df.groupby([("a", 1)])["b"].max() tm.assert_frame_equal(expected_max, result_max) - expected_groups = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].groups - result_groups = df.groupby([("a", 1)])["b"].groups + msg = "`groups` by one element list returns scalar is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected_groups = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].groups + result_groups = df.groupby([("a", 1)])["b"].groups tm.assert_dict_equal(expected_groups, result_groups) def test_groupby_level(self, sort, multiindex_dataframe_random_data, df): @@ -719,15 +732,18 @@ def test_grouping_labels(self, 
multiindex_dataframe_random_data): tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) def test_list_grouper_with_nat(self): - # GH 14715 + # GH 14715, GH#59179 df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT grouper = Grouper(key="date", freq="YS") + msg = "`groups` by one element list returns scalar is deprecated" # Grouper in a list grouping - result = df.groupby([grouper]) + gb = df.groupby([grouper]) expected = {Timestamp("2011-01-01"): Index(list(range(364)))} - tm.assert_dict_equal(result.groups, expected) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.groups + tm.assert_dict_equal(result, expected) # Test case without a list result = df.groupby(grouper) @@ -994,17 +1010,20 @@ def test_gb_key_len_equal_axis_len(self): class TestIteration: def test_groups(self, df): grouped = df.groupby(["A"]) - groups = grouped.groups - assert groups is grouped.groups # caching works + msg = "`groups` by one element list returns scalar is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=msg): + groups = grouped.groups + assert groups is grouped.groups # caching works - for k, v in grouped.groups.items(): + for k, v in groups.items(): assert (df.loc[v]["A"] == k).all() grouped = df.groupby(["A", "B"]) groups = grouped.groups assert groups is grouped.groups # caching works - for k, v in grouped.groups.items(): + for k, v in groups.items(): assert (df.loc[v]["A"] == k[0]).all() assert (df.loc[v]["B"] == k[1]).all() diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 5a8192a9ffe02..9f3e620ca9872 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -534,6 +534,9 @@ def test_groupby_raises_category_np( _call_and_check(klass, msg, how, gb, groupby_func_np, ()) +@pytest.mark.filterwarnings( + "ignore:`groups` by one element list returns scalar is deprecated" +) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_category_on_category( how, From 1fa50252e1f411dbd5ee37b45f3ee602b39fd68c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:01:03 -1000 Subject: [PATCH 218/272] CLN: Remove Index.sort (#59283) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/base.py | 7 ------- pandas/tests/indexes/test_any_index.py | 6 ------ 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5cfb36986f8ef..5d89613bd3d4f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -236,6 +236,7 @@ Other API changes - All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` during indexing operations (:issue:`57922`) - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) +- Removed :meth:`Index.sort` which always raised a ``TypeError``. This attribute is not defined and will raise an ``AttributeError`` (:issue:`59283`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. 
Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 541ed7dce3aac..e67c59c86dd0c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5785,13 +5785,6 @@ def sort_values( else: return sorted_index - @final - def sort(self, *args, **kwargs): - """ - Use sort_values instead. - """ - raise TypeError("cannot sort an Index object in-place, use sort_values instead") - def shift(self, periods: int = 1, freq=None) -> Self: """ Shift index by desired number of time frequency increments. diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 284e219fd20e4..e1ed96195e0a7 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -22,12 +22,6 @@ def test_boolean_context_compat(index): bool(index) -def test_sort(index): - msg = "cannot sort an Index object in-place, use sort_values instead" - with pytest.raises(TypeError, match=msg): - index.sort() - - def test_hash_error(index): with pytest.raises(TypeError, match=f"unhashable type: '{type(index).__name__}'"): hash(index) From 29b0b28a2334cc3a8543b69426bed90eee3c6e67 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Jul 2024 18:08:31 +0200 Subject: [PATCH 219/272] PDEP-14: Dedicated string data type for pandas 3.0 (#58551) Co-authored-by: Simon Hawkins Co-authored-by: Irv Lustig Co-authored-by: William Ayd Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- web/pandas/pdeps/0014-string-dtype.md | 375 ++++++++++++++++++++++++++ 1 file changed, 375 insertions(+) create mode 100644 web/pandas/pdeps/0014-string-dtype.md diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/pdeps/0014-string-dtype.md new file mode 100644 index 0000000000000..5b74f71216454 --- /dev/null +++ b/web/pandas/pdeps/0014-string-dtype.md @@ -0,0 +1,375 @@ +# PDEP-14: Dedicated string data type for pandas 3.0 + +- Created: May 3, 2024 +- Status: Accepted +- Discussion: https://github.com/pandas-dev/pandas/pull/58551 +- Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche) +- Revision: 1 + +## Abstract + +This PDEP proposes to introduce a dedicated string dtype that will be used by +default in pandas 3.0: + +* In pandas 3.0, enable a string dtype (`"str"`) by default, using PyArrow if available + or otherwise a string dtype using numpy object-dtype under the hood as fallback. +* The default string dtype will use missing value semantics (using NaN) consistent + with the other default data types. + +This will give users a long-awaited proper string dtype for 3.0, while 1) not +(yet) making PyArrow a _hard_ dependency, but only a dependency used by default, +and 2) leaving room for future improvements (different missing value semantics, +using NumPy 2.0 strings, etc). + +## Background + +Currently, pandas by default stores text data in an `object`-dtype NumPy array. +The current implementation has two primary drawbacks. 
First, `object` dtype is +not specific to strings: any Python object can be stored in an `object`-dtype +array, not just strings, and seeing `object` as the dtype for a column with +strings is confusing for users. Second: this is not efficient (all string +methods on a Series are eventually calling Python methods on the individual +string objects). + +To solve the first issue, a dedicated extension dtype for string data has +already been +[added in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#dedicated-string-data-type). +This has always been opt-in for now, requiring users to explicitly request the +dtype (with `dtype="string"` or `dtype=pd.StringDtype()`). The array backing +this string dtype was initially almost the same as the default implementation, +i.e. an `object`-dtype NumPy array of Python strings. + +To solve the second issue (performance), pandas contributed to the development +of string kernels in the PyArrow package, and a variant of the string dtype +backed by PyArrow was +[added in pandas 1.3](https://pandas.pydata.org/docs/whatsnew/v1.3.0.html#pyarrow-backed-string-data-type). +This could be specified with the `storage` keyword in the opt-in string dtype +(`pd.StringDtype(storage="pyarrow")`). + +Since its introduction, the `StringDtype` has always been opt-in, and has used +the experimental `pd.NA` sentinel for missing values (which was also [introduced +in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#experimental-na-scalar-to-denote-missing-values)). +However, up to this date, pandas has not yet taken the step to use `pd.NA` for +for any default dtype, and thus the `StringDtype` deviates in missing value +behaviour compared to the default data types. + +In 2023, [PDEP-10](https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html) +proposed to start using a PyArrow-backed string dtype by default in pandas 3.0 +(i.e. infer this type for string data instead of object dtype). To ensure we +could use the variant of `StringDtype` backed by PyArrow instead of Python +objects (for better performance), it proposed to make `pyarrow` a new required +runtime dependency of pandas. + +In the meantime, NumPy has also been working on a native variable-width string +data type, which was made available [starting with NumPy +2.0](https://numpy.org/devdocs/release/2.0.0-notes.html#stringdtype-has-been-added-to-numpy). +This can provide a potential alternative to PyArrow for implementing a string +data type in pandas that is not backed by Python objects. + +After acceptance of PDEP-10, two aspects of the proposal have been under +reconsideration: + +- Based on feedback from users and maintainers from other packages (mostly + around installation complexity and size), it has been considered to relax the + new `pyarrow` requirement to not be a _hard_ runtime dependency. In addition, + NumPy 2.0 could in the future potentially reduce the need to make PyArrow a + required dependency specifically for a dedicated pandas string dtype. +- PDEP-10 did not consider the usage of the experimental `pd.NA` as a + consequence of adopting one of the existing implementations of the + `StringDtype`. 
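The missing value behaviour at stake in this reconsideration is easy to see with pandas 2.x dtypes. A short comparison (results as produced by current releases):

```python
import pandas as pd

s = pd.Series(["a", None], dtype="string")  # opt-in StringDtype, NA semantics
s.str.len()   # nullable "Int64" result; the missing entry is <NA>
s == "a"      # nullable "boolean" result; the missing entry stays <NA>

t = pd.Series(["a", None], dtype=object)    # today's default for strings
t.str.len()   # numpy float64 result; the missing entry is NaN
t == "a"      # numpy bool result; the missing entry compares False
```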
+ +For the second aspect, another variant of the `StringDtype` was +[introduced in pandas 2.1](https://pandas.pydata.org/docs/whatsnew/v2.1.0.html#whatsnew-210-enhancements-infer-strings) +that is still backed by PyArrow but follows the default missing values semantics +pandas uses for all other default data types (and using `NaN` as the missing +value sentinel) ([GH-54792](https://github.com/pandas-dev/pandas/issues/54792)). +At the time, the `storage` option for this new variant was called +`"pyarrow_numpy"` to disambiguate from the existing `"pyarrow"` option using +`pd.NA` (but this PDEP proposes a better naming scheme, see the "Naming" +subsection below). + +This last dtype variant is what users currently (pandas 2.2) get for string data +when enabling the ``future.infer_string`` option (to enable the behaviour which +is intended to become the default in pandas 3.0). + +## Proposal + +To be able to move forward with a string data type in pandas 3.0, this PDEP proposes: + +1. For pandas 3.0, a `"str"` string dtype is enabled by default, i.e. this + string dtype will be used as the default dtype for text data when creating + pandas objects (e.g. inference in constructors, I/O functions). +2. This default string dtype will follow the same behaviour for missing values + as other default data types, and use `NaN` as the missing value sentinel. +3. The string dtype will use PyArrow if installed, and otherwise falls back to + an in-house functionally-equivalent (but slower) version. This fallback can + reuse (with minor code additions) the existing numpy object-dtype backed + StringArray for its implementation. +4. Installation guidelines are updated to clearly encourage users to install + pyarrow for the default user experience. + +Those string dtypes enabled by default will then no longer be considered as +experimental. + +### Default inference of a string dtype + +By default, pandas will infer this new string dtype instead of object dtype for +string data (when creating pandas objects, such as in constructors or IO +functions). + +In pandas 2.2, the existing `future.infer_string` option can be used to opt-in to the future +default behaviour: + +```python +>>> pd.options.future.infer_string = True +>>> pd.Series(["a", "b", None]) +0 a +1 b +2 NaN +dtype: string +``` + +Right now (pandas 2.2), the existing option only enables the PyArrow-based +future dtype. For the remaining 2.x releases, this option will be expanded to +also work when PyArrow is not installed to enable the object-dtype fallback in +that case. + +### Missing value semantics + +As mentioned in the background section, the original `StringDtype` has always +used the experimental `pd.NA` sentinel for missing values. In addition to using +`pd.NA` as the scalar for a missing value, this essentially means that: + +- String columns follow ["NA-semantics"](https://pandas.pydata.org/docs/user_guide/missing_data.html#na-semantics) + for missing values, where `NA` propagates in boolean operations such as + comparisons or predicates. +- Operations on the string column that give a numeric or boolean result use the + nullable Integer/Float/Boolean data types (e.g. `ser.str.len()` returns the + nullable `"Int64"` / `pd.Int64Dtype()` dtype instead of the numpy `int64` + dtype (or `float64` in case of missing values)). + +However, up to this date, all other default data types still use `NaN` semantics +for missing values. 
+
+However, up to this date, all other default data types still use `NaN` semantics
+for missing values. Therefore, this proposal says that the new default string
+dtype should, at this point, also use the same default missing value semantics
+and return default data types when doing operations on the string column, to be
+consistent with the other default dtypes.
+
+In practice, this means that the default string dtype will use `NaN` as
+the missing value sentinel, and:
+
+- String columns will follow NaN-semantics for missing values, where `NaN` gives
+  False in boolean operations such as comparisons or predicates.
+- Operations on the string column that give a numeric or boolean result will use
+  the default data types (i.e. numpy `int64`/`float64`/`bool`).
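+
+For comparison, a sketch of the same operations under the proposed default
+`"str"` dtype (this matches the behaviour the `"pyarrow_numpy"` variant already
+has in pandas 2.2):
+
+```python
+>>> ser = pd.Series(["a", "b", None], dtype="str")
+>>> ser == "a"
+0     True
+1    False
+2    False
+dtype: bool
+>>> ser.str.len()
+0    1.0
+1    1.0
+2    NaN
+dtype: float64
+```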
+
+Because the original `StringDtype` implementations already use `pd.NA` and
+return masked integer and boolean arrays in operations, a new variant of the
+existing dtypes that uses `NaN` and default data types was needed. The original
+variant of `StringDtype` using `pd.NA` will continue to be available for those
+who were already using it.
+
+### Object-dtype "fallback" implementation
+
+To avoid a hard dependency on PyArrow for pandas 3.0, this PDEP proposes to keep
+a "fallback" option in case PyArrow is not installed. The original `StringDtype`
+backed by a numpy object-dtype array of Python strings can be mostly reused for
+this (adding a new variant of the dtype), and a new `StringArray` subclass only
+needs minor changes to follow the above-mentioned missing value semantics
+([GH-58451](https://github.com/pandas-dev/pandas/pull/58451)).
+
+For pandas 3.0, this is the most realistic option given that this implementation
+has already been available for a long time. Beyond 3.0, further improvements such
+as using NumPy 2.0 ([GH-58503](https://github.com/pandas-dev/pandas/issues/58503))
+or nanoarrow ([GH-58552](https://github.com/pandas-dev/pandas/issues/58552)) can
+still be explored, but at that point these are implementation details that
+should not have a direct impact on users (except for performance).
+
+For the original variant of `StringDtype` using `pd.NA`, currently the default
+storage is `"python"` (the object-dtype based implementation). Also for this
+variant, it is proposed to follow the same logic for determining the default
+storage, i.e. default to `"pyarrow"` if available, and otherwise
+fall back to `"python"`.
+
+### Naming
+
+Given the long history of this discussion, naming the dtypes is a difficult
+topic.
+
+In the first place, it should be acknowledged that most users should not need to
+use storage-specific options. Users are expected to specify a generic name (such
+as `"str"` or `"string"`), and that will give them their default string dtype
+(which depends on whether PyArrow is installed or not).
+
+For the generic string alias to specify the dtype, `"string"` is already used
+for the `StringDtype` using `pd.NA`. This PDEP proposes to use `"str"` for the
+new default `StringDtype` using `NaN`. This ensures backwards compatibility for
+code using `dtype="string"`, and was also chosen because `dtype="str"` or
+`dtype=str` currently already works to ensure your data is converted to
+strings (only using object dtype for the result).
+
+But for testing purposes and advanced use cases that want control over the exact
+variant of the `StringDtype`, we need some way to specify this and distinguish
+them from the other string dtypes.
+
+Currently (pandas 2.2), `StringDtype(storage="pyarrow_numpy")` is used for the
+new variant using `NaN`, where the `"pyarrow_numpy"` storage was used to
+disambiguate it from the existing `"pyarrow"` option using `pd.NA`. However,
+`"pyarrow_numpy"` is a rather confusing option that doesn't generalize well.
+Therefore, this PDEP proposes a new naming scheme as outlined below, and
+`"pyarrow_numpy"` will be deprecated as an alias in pandas 2.3 and removed in
+pandas 3.0.
+
+The `storage` keyword of `StringDtype` is kept to disambiguate the underlying
+storage of the string data (using pyarrow or python objects), but an additional
+`na_value` keyword is introduced to disambiguate the variants using NA semantics
+and NaN semantics.
+
+Overview of the different ways to specify a dtype and the resulting concrete
+dtype of the data:
+
+| User specification                           | Concrete dtype                                                 | String alias                           | Note |
+|----------------------------------------------|----------------------------------------------------------------|----------------------------------------|------|
+| Unspecified (inference)                      | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)`    | "str"                                  | (1)  |
+| `"str"` or `StringDtype(na_value=np.nan)`    | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)`    | "str"                                  | (1)  |
+| `StringDtype("pyarrow", na_value=np.nan)`    | `StringDtype(storage="pyarrow", na_value=np.nan)`              | "str"                                  |      |
+| `StringDtype("python", na_value=np.nan)`     | `StringDtype(storage="python", na_value=np.nan)`               | "str"                                  |      |
+| `StringDtype("pyarrow")`                     | `StringDtype(storage="pyarrow", na_value=pd.NA)`               | "string[pyarrow]"                      |      |
+| `StringDtype("python")`                      | `StringDtype(storage="python", na_value=pd.NA)`                | "string[python]"                       |      |
+| `"string"` or `StringDtype()`                | `StringDtype(storage="pyarrow"\|"python", na_value=pd.NA)`     | "string[pyarrow]" or "string[python]"  | (1)  |
+| `StringDtype("pyarrow_numpy")`               | `StringDtype(storage="pyarrow", na_value=np.nan)`              | "string[pyarrow_numpy]"                | (2)  |
+
+Notes:
+
+- (1) You get "pyarrow" or "python" depending on pyarrow being installed.
+- (2) "pyarrow_numpy" is kept temporarily because this is already in a released
+  version, but it will be deprecated in 2.x and removed for 3.0.
+
+For the new default string dtype, only the `"str"` alias can be used to
+specify the dtype as a string, i.e. pandas would not provide a way to make the
+underlying storage (pyarrow or python) explicit through the string alias. This
+string alias is only a convenience shortcut: for most users `"str"` is
+sufficient (they don't need to specify the storage), and the explicit
+`pd.StringDtype(storage=..., na_value=np.nan)` is still available for more
+fine-grained control.
+
+Also for the existing variant using `pd.NA`, specifying the storage through the
+string alias could be deprecated, but that is left for a separate decision.
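+
+To make the table above concrete, a few of these spellings written out (a
+sketch under the proposed scheme; the `na_value` keyword is the new addition):
+
+```python
+import numpy as np
+import pandas as pd
+
+# the proposed default: NaN semantics, storage resolved automatically
+pd.Series(["a", "b", None], dtype="str")
+pd.Series(["a", "b", None], dtype=pd.StringDtype(na_value=np.nan))
+
+# explicit control over the storage, still with NaN semantics
+pd.StringDtype("python", na_value=np.nan)
+
+# the existing opt-in variants using pd.NA remain unchanged
+pd.Series(["a", "b", None], dtype="string")
+pd.StringDtype("pyarrow")
+```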
+
+## Alternatives
+
+### Why not delay introducing a default string dtype?
+
+To avoid introducing a new string dtype while other discussions and changes are
+in flux (eventually making pyarrow a required dependency? adopting `pd.NA` as
+the default missing value sentinel? using the new NumPy 2.0 capabilities?
+overhauling all our dtypes to use a logical data type system?), introducing a
+default string dtype could also be delayed until there is more clarity in those
+other discussions. Specifically, it would avoid temporarily switching to use
+`NaN` for the string dtype, while in a future version we might switch back
+to `pd.NA` by default.
+
+However:
+
+1. Delaying has a cost: it further postpones introducing a dedicated string
+   dtype that has significant benefits for users, both in usability and (for the
+   part of the user base that has PyArrow installed) in performance.
+2. In case pandas eventually transitions to use `pd.NA` as the default missing
+   value sentinel, a migration path for _all_ pandas data types will be needed,
+   and thus the challenges around this will not be unique to the string dtype
+   and therefore not a reason to delay this.
+
+Making this change now for 3.0 will benefit the majority of users, and the PDEP
+author believes this is worth the cost of the added complexity around "yet
+another dtype" (also for other data types we already have multiple variants).
+
+### Why not use the existing StringDtype with `pd.NA`?
+
+Wouldn't adding even more variants of the string dtype only make things more
+confusing? Indeed, this proposal unfortunately introduces more variants of the
+string dtype. However, the reason for this is to ensure the actual default user
+experience is _less_ confusing, and the new string dtype fits better with the
+other default data types.
+
+If the new default string data type were to use `pd.NA`, then after some
+operations, a user can easily end up with a DataFrame that mixes columns using
+`NaN` semantics and columns using `NA` semantics (and thus a DataFrame that
+could have columns with two different int64, two different float64, two
+different bool, etc. dtypes). This would lead to a very confusing default
+experience.
+
+The proposed new variant of the StringDtype ensures that, for the _default_
+experience, a user will see only one kind of integer dtype, only one kind of
+bool dtype, etc. For now, a user should only get columns using `pd.NA` when
+explicitly opting into this.
+
+### Naming alternatives
+
+An initial version of this PDEP proposed to use the `"string"` alias and the
+default `pd.StringDtype()` class constructor for the new default dtype.
+However, that caused a lot of discussion around backwards compatibility for
+existing users of `dtype=pd.StringDtype()` and `dtype="string"`, which use
+`pd.NA` to represent missing values.
+
+During the discussion, several alternatives have been brought up, both
+alternative keyword names and the use of a different constructor. In the end,
+this PDEP proposes to use a different string alias (`"str"`) but to keep
+using the existing `pd.StringDtype` (with the existing `storage` keyword but
+with an additional `na_value` keyword) for now, to keep the changes as
+minimal as possible, leaving a larger overhaul of the dtype system (potentially
+including different constructor functions or namespaces) for a future
+discussion. See [GH-58613](https://github.com/pandas-dev/pandas/issues/58613)
+for the full discussion.
+
+One consequence is that when using the class constructor for the default dtype,
+it has to be used with non-default arguments, i.e. a user needs to specify
+`pd.StringDtype(na_value=np.nan)` to get the default dtype using `NaN`.
+Therefore, the pandas documentation will focus on the usage of `dtype="str"`.
+
+## Backward compatibility
+
+The most visible backwards incompatible change will be that columns with string
+data will no longer have an `object` dtype. Therefore, code that assumes
+`object` dtype (such as `ser.dtype == object`) will need to be updated. This
+change is done as a hard break in a major release, as warning in advance about
+the changed inference is deemed too noisy.
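+
+For example, a dtype check written against the old behaviour could be updated
+along these lines (a sketch; the outcomes shown assume this proposal):
+
+```python
+ser = pd.Series(["a", "b", "c"])
+
+# before pandas 3.0, string data is inferred as object dtype:
+ser.dtype == object                    # True on 2.x, False on 3.0
+
+# checks that work with the new default string dtype:
+ser.dtype == "str"                     # the new generic alias
+isinstance(ser.dtype, pd.StringDtype)  # matches all StringDtype variants
+```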
+
+To allow testing code in advance, the
+`pd.options.future.infer_string = True` option is available for users.
+
+Otherwise, the actual string-specific functionality (such as the `.str` accessor
+methods) should generally keep working as is.
+
+By preserving the current missing value semantics, this proposal is also mostly
+backwards compatible on this aspect. When storing strings in object dtype,
+pandas did, however, also allow using `None` as the missing value indicator (and
+in certain cases, such as the `shift` method, pandas even introduced this
+itself). In all cases where `None` is currently used as the missing value
+sentinel, this will change to consistently use `NaN`.
+
+### For existing users of `StringDtype`
+
+Existing code that already opted in to use the `StringDtype` using `pd.NA`
+should generally keep working as is. The latest version of this PDEP preserves
+the behaviour of `dtype="string"` or `dtype=pd.StringDtype()` to mean the
+`pd.NA` variant of the dtype.
+
+It does propose to change the default storage to `"pyarrow"` (if available) for
+the opt-in `pd.NA` variant as well, but this should have limited, if any,
+user-visible impact.
+
+## Timeline
+
+The future PyArrow-backed string dtype was already made available behind a feature
+flag in pandas 2.1 (enabled by `pd.options.future.infer_string = True`).
+
+The variant using numpy object-dtype can also be backported to the 2.2.x branch
+to allow easier testing. It is proposed to release this as 2.3.0 (created from
+the 2.2.x branch, given that the main branch already includes many other changes
+targeted for 3.0), together with the changes to the naming scheme.
+
+The 2.3.0 release would then have all future string functionality available
+(both the pyarrow and object-dtype based variants of the default string dtype).
+
+For pandas 3.0, this `future.infer_string` flag becomes enabled by default.
+
+## PDEP-14 History
+
+- 3 May 2024: Initial version

From 6b9f2f5ca38bc9397f0c50cc4843487e3e7c4871 Mon Sep 17 00:00:00 2001
From: Matt Braymer-Hayes
Date: Wed, 24 Jul 2024 12:39:11 -0400
Subject: [PATCH 220/272] DOC: pyarrow.rst: Add missing IO readers (#59304)

pyarrow.rst: Add missing IO readers

Add read_parquet and read_table (experimental) to the list of IO readers that
support the PyArrow engine.
---
 doc/source/user_guide/pyarrow.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst
index 61b383afb7c43..aecbce0441b53 100644
--- a/doc/source/user_guide/pyarrow.rst
+++ b/doc/source/user_guide/pyarrow.rst
@@ -159,9 +159,11 @@ PyArrow also provides IO reading functionality that has been integrated into sev
 functions provide an ``engine`` keyword that can dispatch to PyArrow to accelerate reading from an IO source.

 * :func:`read_csv`
+* :func:`read_feather`
 * :func:`read_json`
 * :func:`read_orc`
-* :func:`read_feather`
+* :func:`read_parquet`
+* :func:`read_table` (experimental)

 .. ipython:: python

From 1afc7a3673806be18537bd3bc19b46f8fc13c808 Mon Sep 17 00:00:00 2001
From: ritwika314
Date: Wed, 24 Jul 2024 12:39:55 -0400
Subject: [PATCH 221/272] DOC: Add Bodo to out-of-core projects in ecosystem
 (#59302)

* DOC: ecosystem.md: add Bodo.

* DOC: ecosystem.md: add Bodo.

* DOC: ecosystem.md: add info about paid and free tiers.

* DOC: ecosystem.md: remove trailing whitespace

---------

Co-authored-by: ritwika314
---
 web/pandas/community/ecosystem.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 6cd67302b2a0e..49ece5564c300 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -362,6 +362,18 @@ any Delta table into Pandas dataframe.

 ## Out-of-core

+### [Bodo](https://bodo.ai/)
+
+Bodo is a high-performance Python computing engine that automatically parallelizes and
+optimizes your code through compilation using HPC (high-performance computing) techniques.
+Designed to operate with native pandas dataframes, Bodo compiles your pandas code to execute
+across multiple cores on a single machine or distributed clusters of multiple compute nodes efficiently.
+Bodo also makes distributed pandas dataframes queryable with SQL.
+
+The community edition of Bodo is free to use on up to 8 cores. Beyond that, Bodo offers a paid
+enterprise edition. Free licenses of Bodo (for more than 8 cores) are available
+[upon request](https://www.bodo.ai/contact) for academic and non-profit use.
+
 ### [Cylon](https://cylondata.org/)

 Cylon is a fast, scalable, distributed memory parallel runtime with a pandas

From f7327491063dfc2c51679379625fb9aa18a5b0e6 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 24 Jul 2024 06:42:09 -1000
Subject: [PATCH 222/272] TST: Clean tests that construct Index equivalent to
 RangeIndexes (#57441)

* API: Check index and column classes exactly by default

* Add a todo

* Change test for expected behavior

* add ignore index check

* ignore column checking for some test

* Ignore index checking for test_concat_all_na_block

* Ignore adjust some tests

* Fix another test

* Adjust more tests

* Fix more tests

* Adjust more tests

* adjust another test

* Adjust more tests

* Adjust test

* Adjust test

* Adjust test

* Fix more tests

* Fix more tests

* Fix more tests

* Fix tests

* Adjust more tests

* Adjust more tests

* Fix some tests

* Adjust tests

* Fix test

* Fix more test

* Adjust more tests

* Undo some strictness checking

* update tests

* Adjust more tests

* Another test

* Adjust more tests

* fix another test

* Fix test

* Fix another test

* fix more test

* More indexes

* Undo assert_ functions for strict checking

* Fix tests
---
 pandas/tests/apply/test_frame_apply.py | 10 +--
 pandas/tests/arithmetic/test_numeric.py | 2 +-
 pandas/tests/computation/test_eval.py | 2 +-
 pandas/tests/extension/base/getitem.py | 5 +-
 pandas/tests/extension/base/reshaping.py | 4 +-
 pandas/tests/extension/base/setitem.py | 2 +-
 pandas/tests/frame/indexing/test_indexing.py | 6 +-
 pandas/tests/frame/indexing/test_setitem.py | 2 +-
 pandas/tests/frame/methods/test_compare.py | 9 +--
 .../frame/methods/test_drop_duplicates.py | 13 ++--
 pandas/tests/frame/methods/test_dropna.py | 4 +-
 pandas/tests/frame/methods/test_explode.py | 2 +-
 pandas/tests/frame/methods/test_nlargest.py | 17 +++--
 pandas/tests/frame/methods/test_quantile.py | 2 +-
 .../tests/frame/methods/test_sort_values.py | 14 ++--
 pandas/tests/frame/methods/test_transpose.py | 3 +-
 pandas/tests/frame/test_constructors.py | 22 +++---
 pandas/tests/frame/test_query_eval.py | 9 +--
 pandas/tests/frame/test_reductions.py | 30 ++++----
 pandas/tests/frame/test_stack_unstack.py | 36 ++++++----
 pandas/tests/groupby/test_filters.py | 4 +-
 pandas/tests/groupby/test_groupby.py | 68 +++++++++++--------
.../datetimes/methods/test_to_series.py | 2 +- pandas/tests/indexes/numeric/test_setops.py | 2 +- pandas/tests/indexes/test_common.py | 4 +- pandas/tests/indexes/test_old_base.py | 5 +- .../indexes/timedeltas/test_timedelta.py | 4 +- pandas/tests/indexing/test_indexing.py | 4 +- pandas/tests/indexing/test_loc.py | 8 +-- pandas/tests/io/excel/test_readers.py | 2 +- pandas/tests/io/excel/test_writers.py | 28 ++++++-- pandas/tests/io/json/test_normalize.py | 2 +- .../io/parser/dtypes/test_dtypes_basic.py | 2 +- pandas/tests/io/parser/test_header.py | 2 +- pandas/tests/io/test_sql.py | 8 +-- pandas/tests/io/xml/test_xml.py | 5 +- pandas/tests/reductions/test_reductions.py | 2 +- pandas/tests/reshape/concat/test_datetimes.py | 4 +- pandas/tests/reshape/concat/test_index.py | 6 +- pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/series/methods/test_nlargest.py | 2 +- pandas/tests/series/methods/test_reindex.py | 28 ++++---- pandas/tests/strings/test_strings.py | 6 +- pandas/tests/test_sorting.py | 1 - pandas/tests/tools/test_to_datetime.py | 4 +- pandas/tests/window/test_expanding.py | 12 ++-- pandas/tests/window/test_pairwise.py | 13 ++-- 47 files changed, 235 insertions(+), 189 deletions(-) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 78c52d3ddfbdf..ba405d4bd1cab 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -368,18 +368,18 @@ def test_apply_mixed_dtype_corner(): result = df[:0].apply(np.mean, axis=1) # the result here is actually kind of ambiguous, should it be a Series # or a DataFrame? - expected = Series(np.nan, index=pd.Index([], dtype="int64")) + expected = Series(dtype=np.float64) tm.assert_series_equal(result, expected) def test_apply_mixed_dtype_corner_indexing(): df = DataFrame({"A": ["foo"], "B": [1.0]}) result = df.apply(lambda x: x["A"], axis=1) - expected = Series(["foo"], index=[0]) + expected = Series(["foo"], index=range(1)) tm.assert_series_equal(result, expected) result = df.apply(lambda x: x["B"], axis=1) - expected = Series([1.0], index=[0]) + expected = Series([1.0], index=range(1)) tm.assert_series_equal(result, expected) @@ -1037,7 +1037,7 @@ def test_result_type(int_frame_const_col): result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") expected = df.copy() - expected.columns = [0, 1, 2] + expected.columns = range(3) tm.assert_frame_equal(result, expected) @@ -1047,7 +1047,7 @@ def test_result_type_shorter_list(int_frame_const_col): df = int_frame_const_col result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") expected = df[["A", "B"]].copy() - expected.columns = [0, 1] + expected.columns = range(2) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 1b8ad1922b9d2..d205569270705 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1451,7 +1451,7 @@ def test_fill_value_inf_masking(): expected = pd.DataFrame( {"A": [np.inf, 1.0, 0.0, 1.0], "B": [0.0, np.nan, 0.0, np.nan]} ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_dataframe_div_silenced(): diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 1844b47847e95..31d568d7c1e0c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1800,7 +1800,7 @@ def 
test_numexpr_option_incompatible_op(): {"A": [True, False, True, False, None, None], "B": [1, 2, 3, 4, 5, 6]} ) result = df.query("A.isnull()") - expected = DataFrame({"A": [None, None], "B": [5, 6]}, index=[4, 5]) + expected = DataFrame({"A": [None, None], "B": [5, 6]}, index=range(4, 6)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 3fa2f50bf4930..27fa1206f6f7f 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -408,7 +408,7 @@ def test_take_series(self, data): result = s.take([0, -1]) expected = pd.Series( data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), - index=[0, len(data) - 1], + index=range(0, 198, 99), ) tm.assert_series_equal(result, expected) @@ -428,7 +428,8 @@ def test_reindex(self, data, na_value): result = s.reindex([n, n + 1]) expected = pd.Series( - data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1] + data._from_sequence([na_value, na_value], dtype=s.dtype), + index=range(n, n + 2, 1), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 24be94443c5ba..2915c0585f373 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -33,8 +33,8 @@ def test_concat(self, data, in_frame): @pytest.mark.parametrize("in_frame", [True, False]) def test_concat_all_na_block(self, data_missing, in_frame): - valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1]) - na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3]) + valid_block = pd.Series(data_missing.take([1, 1]), index=range(2)) + na_block = pd.Series(data_missing.take([0, 0]), index=range(2, 4)) if in_frame: valid_block = pd.DataFrame({"a": valid_block}) na_block = pd.DataFrame({"a": na_block}) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index a455b21b9932a..1d613ced2c03f 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -374,7 +374,7 @@ def test_setitem_preserves_views(self, data): def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 - df = expected = pd.DataFrame({0: pd.Series(data)}) + df = expected = pd.DataFrame(pd.Series(data)) result = pd.DataFrame(index=df.index) key = full_indexer(df) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 693075a881833..a95fc10157a29 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -991,7 +991,7 @@ def test_single_element_ix_dont_upcast(self, float_frame): result = df.loc[0, "b"] assert is_integer(result) - expected = Series([666], [0], name="b") + expected = Series([666], index=range(1), name="b") result = df.loc[[0], "b"] tm.assert_series_equal(result, expected) @@ -1193,7 +1193,7 @@ def test_type_error_multiindex(self): # See gh-12218 mi = MultiIndex.from_product([["x", "y"], [0, 1]], names=[None, "c"]) dg = DataFrame( - [[1, 1, 2, 2], [3, 3, 4, 4]], columns=mi, index=Index([0, 1], name="i") + [[1, 1, 2, 2], [3, 3, 4, 4]], columns=mi, index=Index(range(2), name="i") ) with pytest.raises(InvalidIndexError, match="slice"): dg[:, 0] @@ -1452,7 +1452,7 @@ def test_iloc_ea_series_indexer(self): indexer = Series([0, 1], dtype="Int64") row_indexer = Series([1], dtype="Int64") 
result = df.iloc[row_indexer, indexer] - expected = DataFrame([[5, 6]], index=[1]) + expected = DataFrame([[5, 6]], index=range(1, 2)) tm.assert_frame_equal(result, expected) result = df.iloc[row_indexer.values, indexer.values] diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index df3b058ca51f9..75f52a57a0949 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -165,7 +165,7 @@ def test_setitem_timestamp_empty_columns(self): df["now"] = Timestamp("20130101", tz="UTC") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] + [[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"] ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 75e60a4816902..2ffc3f933e246 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -21,7 +21,7 @@ def test_compare_axis(align_axis): result = df.compare(df2, align_axis=align_axis) if align_axis in (1, "columns"): - indices = pd.Index([0, 2]) + indices = pd.RangeIndex(0, 4, 2) columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) expected = pd.DataFrame( [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]], @@ -29,7 +29,7 @@ def test_compare_axis(align_axis): columns=columns, ) else: - indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + indices = pd.MultiIndex.from_product([range(0, 4, 2), ["self", "other"]]) columns = pd.Index(["col1", "col3"]) expected = pd.DataFrame( [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]], @@ -60,7 +60,7 @@ def test_compare_various_formats(keep_shape, keep_equal): result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal) if keep_shape: - indices = pd.Index([0, 1, 2]) + indices = pd.RangeIndex(3) columns = pd.MultiIndex.from_product( [["col1", "col2", "col3"], ["self", "other"]] ) @@ -85,7 +85,7 @@ def test_compare_various_formats(keep_shape, keep_equal): columns=columns, ) else: - indices = pd.Index([0, 2]) + indices = pd.RangeIndex(0, 4, 2) columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) expected = pd.DataFrame( [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]], index=indices, columns=columns @@ -203,6 +203,7 @@ def test_compare_result_names(): }, ) result = df1.compare(df2, result_names=("left", "right")) + result.index = pd.Index([0, 2]) expected = pd.DataFrame( { ("col1", "left"): {0: "a", 2: np.nan}, diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 6bea97b2cf189..419fb75cb3669 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -411,10 +411,15 @@ def test_drop_duplicates_inplace(): @pytest.mark.parametrize( "origin_dict, output_dict, ignore_index, output_index", [ - ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), - ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), - ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), - ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, range(2)), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, range(0, 4, 2)), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, range(2)), + ( + {"A": [2, 2, 3], "B": [2, 2, 4]}, + {"A": [2, 3], "B": [2, 4]}, + 
False, + range(0, 4, 2), + ), ], ) def test_drop_duplicates_ignore_index( diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 7899b4aeac3fd..11893d7fac1a4 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -195,7 +195,7 @@ def test_dropna_tz_aware_datetime(self): # Ex2 df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) result = df.dropna(axis=0) - expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) + expected = DataFrame([dt1, dt2], columns=["Time"], index=range(0, 6, 3)) tm.assert_frame_equal(result, expected) def test_dropna_categorical_interval_index(self): @@ -233,7 +233,7 @@ def test_set_single_column_subset(self): # GH 41021 df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.nan, 5]}) expected = DataFrame( - {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2] + {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=range(0, 4, 2) ) result = df.dropna(subset="C") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index ca9764c023244..876ad5539d603 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -210,7 +210,7 @@ def test_ignore_index(): df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}) result = df.explode("values", ignore_index=True) expected = pd.DataFrame( - {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] + {"id": [0, 0, 10, 10], "values": list("abcd")}, index=range(4) ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 7b6a0487c296a..56bb3126455a5 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -82,6 +82,7 @@ def test_nlargest_n(self, nselect_method, n, order): else: ascending = nselect_method == "nsmallest" result = getattr(df, nselect_method)(n, order) + result.index = pd.Index(list(result.index)) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) @@ -132,7 +133,7 @@ def test_nlargest_n_identical_values(self): df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) result = df.nlargest(3, "a") - expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=range(3)) tm.assert_frame_equal(result, expected) result = df.nsmallest(3, "a") @@ -179,18 +180,20 @@ def test_nlargest_duplicate_keep_all_ties(self): result = df.nlargest(4, "a", keep="all") expected = pd.DataFrame( { - "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, - } + "a": [5, 4, 4, 3, 3, 3, 3], + "b": [10, 9, 8, 5, 50, 10, 20], + }, + index=[0, 1, 2, 4, 5, 6, 7], ) tm.assert_frame_equal(result, expected) result = df.nsmallest(2, "a", keep="all") expected = pd.DataFrame( { - "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, - } + "a": [2, 3, 3, 3, 3], + "b": [7, 5, 50, 10, 20], + }, + index=range(3, 8), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index f35b77da0b547..4181740d62627 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -127,7 +127,7 @@ def 
test_axis_numeric_only_true(self, interp_method): result = df.quantile( 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method ) - expected = Series([3.0, 4.0], index=[0, 1], name=0.5) + expected = Series([3.0, 4.0], index=range(2), name=0.5) if interpolation == "nearest": expected = expected.astype(np.int64) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index c146dcc9c2d71..e728526519e9d 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -170,7 +170,7 @@ def test_sort_values_multicolumn_uint64(self): "a": pd.Series([18446637057563306014, 1162265347240853609]), "b": pd.Series([1, 2]), }, - index=pd.Index([1, 0]), + index=range(1, -1, -1), ) tm.assert_frame_equal(result, expected) @@ -360,7 +360,7 @@ def test_sort_values_nat_values_in_int_column(self): df_reversed = DataFrame( {"int": int_values[::-1], "float": float_values[::-1]}, columns=["int", "float"], - index=[1, 0], + index=range(1, -1, -1), ) # NaT is not a "na" for int64 columns, so na_position must not @@ -385,7 +385,7 @@ def test_sort_values_nat_values_in_int_column(self): df_reversed = DataFrame( {"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]}, columns=["datetime", "float"], - index=[1, 0], + index=range(1, -1, -1), ) df_sorted = df.sort_values(["datetime", "float"], na_position="first") @@ -540,19 +540,19 @@ def test_sort_values_na_position_with_categories_raises(self): @pytest.mark.parametrize( "original_dict, sorted_dict, ignore_index, output_index", [ - ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]), - ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, range(3)), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, range(2, -1, -1)), ( {"A": [1, 2, 3], "B": [2, 3, 4]}, {"A": [3, 2, 1], "B": [4, 3, 2]}, True, - [0, 1, 2], + range(3), ), ( {"A": [1, 2, 3], "B": [2, 3, 4]}, {"A": [3, 2, 1], "B": [4, 3, 2]}, False, - [2, 1, 0], + range(2, -1, -1), ), ], ) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index f42fd4483e9ac..1b7b30ac40363 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -25,6 +25,7 @@ def test_transpose_td64_intervals(self): df = DataFrame(ii) result = df.T + result.columns = Index(list(range(len(ii)))) expected = DataFrame({i: ii[i : i + 1] for i in range(len(ii))}) tm.assert_frame_equal(result, expected) @@ -153,7 +154,6 @@ def test_transpose_not_inferring_dt(self): result = df.T expected = DataFrame( [[Timestamp("2019-12-31"), Timestamp("2019-12-31")]], - columns=[0, 1], index=["a"], dtype=object, ) @@ -175,7 +175,6 @@ def test_transpose_not_inferring_dt_mixed_blocks(self): [Timestamp("2019-12-31"), Timestamp("2019-12-31")], [Timestamp("2019-12-31"), Timestamp("2019-12-31")], ], - columns=[0, 1], index=["a", "b"], dtype=object, ) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 2d5772eb5cb53..dfcd0d7bfea54 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -101,7 +101,7 @@ def test_constructor_dict_with_tzaware_scalar(self): df = DataFrame({"dt": dt}, index=[0]) expected = DataFrame({"dt": [dt]}) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected, check_index_type=False) # Non-homogeneous df = 
DataFrame({"dt": dt, "value": [1]}) @@ -566,7 +566,7 @@ def test_constructor_invalid_items_unused(self, scalar): expected = DataFrame(columns=["b"]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) + @pytest.mark.parametrize("value", [4, np.nan, None, float("nan")]) def test_constructor_dict_nan_key(self, value): # GH 18455 cols = [1, value, 3] @@ -852,10 +852,10 @@ def create_data(constructor): expected = DataFrame( [ - {0: 0, 1: None, 2: None, 3: None}, - {0: None, 1: 2, 2: None, 3: None}, - {0: None, 1: None, 2: 4, 3: None}, - {0: None, 1: None, 2: None, 3: 6}, + [0, None, None, None], + [None, 2, None, None], + [None, None, 4, None], + [None, None, None, 6], ], index=[Timestamp(dt) for dt in dates_as_str], ) @@ -933,7 +933,7 @@ def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype): ) def test_constructor_extension_scalar_data(self, data, dtype): # GH 34832 - df = DataFrame(index=[0, 1], columns=["a", "b"], data=data) + df = DataFrame(index=range(2), columns=["a", "b"], data=data) assert df["a"].dtype == dtype assert df["b"].dtype == dtype @@ -1269,7 +1269,7 @@ def test_constructor_list_of_lists(self, using_infer_string): # GH 4851 # list of 0-dim ndarrays - expected = DataFrame({0: np.arange(10)}) + expected = DataFrame(np.arange(10)) data = [np.array(x) for x in range(10)] result = DataFrame(data) tm.assert_frame_equal(result, expected) @@ -1326,7 +1326,7 @@ def test_constructor_unequal_length_nested_list_column(self): ) def test_constructor_one_element_data_list(self, data): # GH#42810 - result = DataFrame(data, index=[0, 1, 2], columns=["x"]) + result = DataFrame(data, index=range(3), columns=["x"]) expected = DataFrame({"x": [Timestamp("2021-01-01")] * 3}) tm.assert_frame_equal(result, expected) @@ -1633,7 +1633,7 @@ def test_constructor_Series_named(self): s = Series(arr, index=range(3, 13)) df = DataFrame(s) expected = DataFrame({0: s}) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected, check_column_type=False) msg = r"Shape of passed values is \(10, 1\), indices imply \(10, 2\)" with pytest.raises(ValueError, match=msg): @@ -1652,7 +1652,7 @@ def test_constructor_Series_named(self): # this is a bit non-intuitive here; the series collapse down to arrays df = DataFrame([arr, s1]).T - expected = DataFrame({1: s1, 0: arr}, columns=[0, 1]) + expected = DataFrame({1: s1, 0: arr}, columns=range(2)) tm.assert_frame_equal(df, expected) def test_constructor_Series_named_and_columns(self): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index b791868b173e4..4f10fb2e0e9f5 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1177,6 +1177,7 @@ def test_query_string_null_elements(self, in_list): df_expected = DataFrame({"a": expected}, dtype="string") df_expected.index = df_expected.index.astype("int64") df = DataFrame({"a": in_list}, dtype="string") + df.index = Index(list(df.index), dtype=df.index.dtype) res1 = df.query("a == 'asdf'", parser=parser, engine=engine) res2 = df[df["a"] == "asdf"] res3 = df.query("a <= 'asdf'", parser=parser, engine=engine) @@ -1419,12 +1420,12 @@ def test_query_ea_dtypes(self, dtype): if dtype == "int64[pyarrow]": pytest.importorskip("pyarrow") # GH#50261 - df = DataFrame({"a": Series([1, 2], dtype=dtype)}) + df = DataFrame({"a": [1, 2]}, dtype=dtype) ref = {2} # noqa: F841 warning = RuntimeWarning if dtype == "Int64" and NUMEXPR_INSTALLED else None with 
tm.assert_produces_warning(warning): result = df.query("a in @ref") - expected = DataFrame({"a": Series([2], dtype=dtype, index=[1])}) + expected = DataFrame({"a": [2]}, index=range(1, 2), dtype=dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("engine", ["python", "numexpr"]) @@ -1443,8 +1444,8 @@ def test_query_ea_equality_comparison(self, dtype, engine): result = df.query("A == B", engine=engine) expected = DataFrame( { - "A": Series([1, 2], dtype="Int64", index=[0, 2]), - "B": Series([1, 2], dtype=dtype, index=[0, 2]), + "A": Series([1, 2], dtype="Int64", index=range(0, 4, 2)), + "B": Series([1, 2], dtype=dtype, index=range(0, 4, 2)), } ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 5118561f67338..649c30bdec790 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -490,10 +490,8 @@ def test_nunique(self): tm.assert_series_equal( df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) ) - tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) - tm.assert_series_equal( - df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) - ) + tm.assert_series_equal(df.nunique(axis=1), Series([1, 2, 2])) + tm.assert_series_equal(df.nunique(axis=1, dropna=False), Series([1, 3, 2])) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_mixed_datetime_numeric(self, tz): @@ -707,8 +705,8 @@ def test_mode_sortwarning(self, using_infer_string): def test_mode_empty_df(self): df = DataFrame([], columns=["a", "b"]) + expected = df.copy() result = df.mode() - expected = DataFrame([], columns=["a", "b"], index=Index([], dtype=np.int64)) tm.assert_frame_equal(result, expected) def test_operators_timedelta64(self): @@ -769,7 +767,7 @@ def test_operators_timedelta64(self): # excludes non-numeric result = mixed.min(axis=1, numeric_only=True) - expected = Series([1, 1, 1.0], index=[0, 1, 2]) + expected = Series([1, 1, 1.0]) tm.assert_series_equal(result, expected) # works when only those columns are selected @@ -1186,21 +1184,21 @@ def test_idxmax_mixed_dtype(self): df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti}) result = df.idxmax() - expected = Series([1, 0, 2], index=[1, 2, 3]) + expected = Series([1, 0, 2], index=range(1, 4)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 0], index=[1, 2, 3]) + expected = Series([0, 2, 0], index=range(1, 4)) tm.assert_series_equal(result, expected) # with NaTs df.loc[0, 3] = pd.NaT result = df.idxmax() - expected = Series([1, 0, 2], index=[1, 2, 3]) + expected = Series([1, 0, 2], index=range(1, 4)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 1], index=[1, 2, 3]) + expected = Series([0, 2, 1], index=range(1, 4)) tm.assert_series_equal(result, expected) # with multi-column dt64 block @@ -1208,11 +1206,11 @@ def test_idxmax_mixed_dtype(self): df._consolidate_inplace() result = df.idxmax() - expected = Series([1, 0, 2, 0], index=[1, 2, 3, 4]) + expected = Series([1, 0, 2, 0], index=range(1, 5)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4]) + expected = Series([0, 2, 1, 2], index=range(1, 5)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1829,7 +1827,7 @@ def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, 
opname)(min_count=0) - expected = Series([exp_value, exp_value], dtype=exp_dtype) + expected = Series([exp_value, exp_value], dtype=exp_dtype, index=range(2)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1852,7 +1850,7 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=1) - expected = Series([np.nan, np.nan], dtype=exp_dtype) + expected = Series([np.nan, np.nan], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1875,7 +1873,7 @@ def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=0) - expected = Series([exp_value, exp_value], dtype=exp_dtype) + expected = Series([exp_value, exp_value], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) # TODO: why does min_count=1 impact the resulting Windows dtype @@ -1900,7 +1898,7 @@ def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=1) - expected = Series([pd.NA, pd.NA], dtype=exp_dtype) + expected = Series([pd.NA, pd.NA], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index a3a1da6e57cb0..fc532a565a173 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -714,13 +714,13 @@ def test_unstack_unused_levels(self): df = DataFrame([[1, 0]] * 3, index=idx) result = df.unstack() - exp_col = MultiIndex.from_product([[0, 1], ["A", "B", "C"]]) + exp_col = MultiIndex.from_product([range(2), ["A", "B", "C"]]) expected = DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col) tm.assert_frame_equal(result, expected) assert (result.columns.levels[1] == idx.levels[1]).all() # Unused items on both levels - levels = [[0, 1, 7], [0, 1, 2, 3]] + levels = [range(3), range(4)] codes = [[0, 0, 1, 1], [0, 2, 0, 2]] idx = MultiIndex(levels, codes) block = np.arange(4).reshape(2, 2) @@ -752,7 +752,7 @@ def test_unstack_unused_levels_mixed_with_nan( result = df.unstack(level=level) exp_data = np.zeros(18) * np.nan exp_data[idces] = data - cols = MultiIndex.from_product([[0, 1], col_level]) + cols = MultiIndex.from_product([range(2), col_level]) expected = DataFrame(exp_data.reshape(3, 6), index=idx_level, columns=cols) tm.assert_frame_equal(result, expected) @@ -1067,7 +1067,7 @@ def test_stack_datetime_column_multiIndex(self, future_stack): with tm.assert_produces_warning(warn, match=msg): result = df.stack(future_stack=future_stack) - eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) + eidx = MultiIndex.from_product([range(4), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) tm.assert_frame_equal(result, expected) @@ -1150,7 +1150,7 @@ def test_stack_full_multiIndex(self, future_stack): expected = DataFrame( [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], index=MultiIndex( - levels=[[0, 1], ["u", "x", "y", "z"]], + levels=[range(2), ["u", "x", "y", "z"]], codes=[[0, 0, 1, 1], [1, 3, 1, 3]], names=[None, "Lower"], ), @@ -1201,7 +1201,7 @@ def test_stack_multi_preserve_categorical_dtype( s_cidx = pd.CategoricalIndex(labels, ordered=ordered) expected_data = sorted(data) if future_stack else data expected = 
Series( - expected_data, index=MultiIndex.from_product([[0], s_cidx, cidx2]) + expected_data, index=MultiIndex.from_product([range(1), s_cidx, cidx2]) ) tm.assert_series_equal(result, expected) @@ -1214,7 +1214,7 @@ def test_stack_preserve_categorical_dtype_values(self, future_stack): cat = pd.Categorical(["a", "a", "b", "c"]) df = DataFrame({"A": cat, "B": cat}) result = df.stack(future_stack=future_stack) - index = MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) + index = MultiIndex.from_product([range(4), ["A", "B"]]) expected = Series( pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index ) @@ -1298,7 +1298,7 @@ def test_unstack_mixed_extension_types(self, level): @pytest.mark.parametrize("level", [0, "baz"]) def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 - mi = MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"]) + mi = MultiIndex.from_product([range(1), ["d", "c"]], names=["bar", "baz"]) df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"]) df.columns.name = "foo" @@ -1339,7 +1339,9 @@ def test_unstack_sort_false(frame_or_series, dtype): result = obj.unstack(level=-1, sort=False) if frame_or_series is DataFrame: - expected_columns = MultiIndex.from_tuples([(0, "b"), (0, "a")]) + expected_columns = MultiIndex( + levels=[range(1), ["b", "a"]], codes=[[0, 0], [0, 1]] + ) else: expected_columns = ["b", "a"] expected = DataFrame( @@ -1355,7 +1357,9 @@ def test_unstack_sort_false(frame_or_series, dtype): result = obj.unstack(level=[1, 2], sort=False) if frame_or_series is DataFrame: - expected_columns = MultiIndex.from_tuples([(0, "z", "b"), (0, "y", "a")]) + expected_columns = MultiIndex( + levels=[range(1), ["z", "y"], ["b", "a"]], codes=[[0, 0], [0, 1], [0, 1]] + ) else: expected_columns = MultiIndex.from_tuples([("z", "b"), ("y", "a")]) expected = DataFrame( @@ -1432,7 +1436,7 @@ def test_stack_timezone_aware_values(future_stack): @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) def test_stack_empty_frame(dropna, future_stack): # GH 36113 - levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] + levels = [pd.RangeIndex(0), pd.RangeIndex(0)] expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) if future_stack and dropna is not lib.no_default: with pytest.raises(ValueError, match="dropna must be unspecified"): @@ -1510,7 +1514,9 @@ def test_stack_positional_level_duplicate_column_names(future_stack): result = df.stack(0, future_stack=future_stack) new_columns = Index(["y", "z"], name="a") - new_index = MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"]) + new_index = MultiIndex( + levels=[range(1), ["x", "y"]], codes=[[0, 0], [0, 1]], names=[None, "a"] + ) expected = DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns) tm.assert_frame_equal(result, expected) @@ -2318,7 +2324,7 @@ def test_stack_unstack_unordered_multiindex(self, future_stack): ) expected = DataFrame( [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]], - index=[0, 1, 2, 3, 4], + index=range(5), columns=MultiIndex.from_tuples( [("a", "x"), ("b", "x")], names=["first", "second"] ), @@ -2520,7 +2526,7 @@ def test_multi_level_stack_categorical(self, future_stack): ] ), ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" @@ -2657,7 +2663,7 @@ def test_stack_tuple_columns(future_stack): expected = Series( [1, 2, 3, 4, 5, 
6, 7, 8, 9], index=MultiIndex( - levels=[[0, 1, 2], [("a", 1), ("a", 2), ("b", 1)]], + levels=[range(3), [("a", 1), ("a", 2), ("b", 1)]], codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ), ) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 04883b3ef6b78..4fe3aac629513 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -248,7 +248,7 @@ def test_filter_using_len(): actual = grouped.filter(lambda x: len(x) > 2) expected = DataFrame( {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)}, - index=np.arange(2, 6, dtype=np.int64), + index=range(2, 6), ) tm.assert_frame_equal(actual, expected) @@ -262,7 +262,7 @@ def test_filter_using_len_series(): s = Series(list("aabbbbcc"), name="B") grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B") + expected = Series(4 * ["b"], index=range(2, 6), name="B") tm.assert_series_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 13fb9cfc4c0e4..93e891c51b86c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -74,7 +74,7 @@ def max_value(group): tm.assert_series_equal(result, expected) -def test_pass_args_kwargs(ts, tsframe): +def test_pass_args_kwargs(ts): def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) @@ -100,28 +100,31 @@ def f(x, q=None, axis=0): tm.assert_series_equal(apply_result, agg_expected) tm.assert_series_equal(trans_result, trans_expected) - # DataFrame - for as_index in [True, False]: - df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) - agg_result = df_grouped.agg(np.percentile, 80, axis=0) - apply_result = df_grouped.apply(DataFrame.quantile, 0.8) - expected = df_grouped.quantile(0.8) - tm.assert_frame_equal(apply_result, expected, check_names=False) - tm.assert_frame_equal(agg_result, expected) - - apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8]) - expected_seq = df_grouped.quantile([0.4, 0.8]) - if not as_index: - # apply treats the op as a transform; .quantile knows it's a reduction - apply_result.index = range(4) - apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2]) - apply_result.insert(loc=1, column="level_1", value=[0.4, 0.8, 0.4, 0.8]) - tm.assert_frame_equal(apply_result, expected_seq, check_names=False) - - agg_result = df_grouped.agg(f, q=80) - apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) - tm.assert_frame_equal(agg_result, expected) - tm.assert_frame_equal(apply_result, expected, check_names=False) + +def test_pass_args_kwargs_dataframe(tsframe, as_index): + def f(x, q=None, axis=0): + return np.percentile(x, q, axis=axis) + + df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) + agg_result = df_grouped.agg(np.percentile, 80, axis=0) + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + expected = df_grouped.quantile(0.8) + tm.assert_frame_equal(apply_result, expected, check_names=False) + tm.assert_frame_equal(agg_result, expected) + + apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8]) + expected_seq = df_grouped.quantile([0.4, 0.8]) + if not as_index: + # apply treats the op as a transform; .quantile knows it's a reduction + apply_result.index = range(4) + apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2]) + apply_result.insert(loc=1, 
column="level_1", value=[0.4, 0.8, 0.4, 0.8]) + tm.assert_frame_equal(apply_result, expected_seq, check_names=False) + + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) + tm.assert_frame_equal(agg_result, expected) + tm.assert_frame_equal(apply_result, expected, check_names=False) def test_len(): @@ -828,7 +831,7 @@ def test_groupby_level_mapper(multiindex_dataframe_random_data): def test_groupby_level_nonmulti(): # GH 1313, GH 13901 s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo")) - expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo")) + expected = Series([11, 22, 3, 4, 5, 6], Index(list(range(1, 7)), name="foo")) result = s.groupby(level=0).sum() tm.assert_series_equal(result, expected) @@ -860,7 +863,7 @@ def test_groupby_level_nonmulti(): def test_groupby_complex(): # GH 12902 a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1]) - expected = Series((1 + 2j, 5 + 10j)) + expected = Series((1 + 2j, 5 + 10j), index=Index([0, 1])) result = a.groupby(level=0).sum() tm.assert_series_equal(result, expected) @@ -1205,7 +1208,10 @@ def test_groupby_nat_exclude(): ) grouped = df.groupby("dt") - expected = [Index([1, 7]), Index([3, 5])] + expected = [ + RangeIndex(start=1, stop=13, step=6), + RangeIndex(start=3, stop=7, step=2), + ] keys = sorted(grouped.groups.keys()) assert len(keys) == 2 for k, e in zip(keys, expected): @@ -1955,9 +1961,9 @@ def test_groups_sort_dropna(sort, dropna): df = DataFrame([[2.0, 1.0], [np.nan, 4.0], [0.0, 3.0]]) keys = [(2.0, 1.0), (np.nan, 4.0), (0.0, 3.0)] values = [ - Index([0], dtype="int64"), - Index([1], dtype="int64"), - Index([2], dtype="int64"), + RangeIndex(0, 1), + RangeIndex(1, 2), + RangeIndex(2, 3), ] if sort: taker = [2, 0] if dropna else [2, 0, 1] @@ -2665,7 +2671,9 @@ def test_groupby_method_drop_na(method): Series(["a", "b", "c"], name="A") ) else: - expected = DataFrame({"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4]) + expected = DataFrame( + {"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=range(0, 6, 2) + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_series.py b/pandas/tests/indexes/datetimes/methods/test_to_series.py index 0c397c8ab2cd3..cd67775b7a5fc 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_series.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_series.py @@ -13,6 +13,6 @@ def test_to_series(self): idx = naive.tz_localize("US/Pacific") expected = Series(np.array(idx.tolist(), dtype="object"), name="B") - result = idx.to_series(index=[0, 1]) + result = idx.to_series(index=range(2)) assert expected.dtype == idx.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/numeric/test_setops.py b/pandas/tests/indexes/numeric/test_setops.py index e9e5a57dfe9e5..5d3981dbf93d0 100644 --- a/pandas/tests/indexes/numeric/test_setops.py +++ b/pandas/tests/indexes/numeric/test_setops.py @@ -41,7 +41,7 @@ def test_intersection(self): other = Index([1, 2, 3, 4, 5]) result = index.intersection(other) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) + expected = Index(range(1, 5)) tm.assert_index_equal(result, expected) result = other.intersection(index) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 43445433e2a04..bf16554871efc 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -223,7 +223,9 @@ def test_unique(self, index_flat): pass 
result = idx.unique() - tm.assert_index_equal(result, idx_unique) + tm.assert_index_equal( + result, idx_unique, exact=not isinstance(index, RangeIndex) + ) # nans: if not index._can_hold_na: diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index b929616c814ee..4b8751fb3ba20 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -822,8 +822,9 @@ def test_append_preserves_dtype(self, simple_index): result = index.append(index) assert result.dtype == index.dtype - tm.assert_index_equal(result[:N], index, check_exact=True) - tm.assert_index_equal(result[N:], index, check_exact=True) + + tm.assert_index_equal(result[:N], index, exact=False, check_exact=True) + tm.assert_index_equal(result[N:], index, exact=False, check_exact=True) alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 3120066741ffa..2066be8976e7f 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -51,9 +51,9 @@ def test_fields(self): s = Series(rng) s[1] = np.nan - tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=[0, 1])) + tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=range(2))) tm.assert_series_equal( - s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1]) + s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=range(2)) ) # preserve name (GH15589) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 61cbb1983e49a..58255edb8e6df 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -751,10 +751,10 @@ def test_loc_range_in_series_indexing(self, size): # GH 11652 s = Series(index=range(size), dtype=np.float64) s.loc[range(1)] = 42 - tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) + tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=range(1))) s.loc[range(2)] = 43 - tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) + tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=range(2))) def test_partial_boolean_frame_indexing(self): # GH 17170 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index b8d012eca28ce..bd1c378642924 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1928,7 +1928,7 @@ def test_loc_setitem_empty_series(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc[1] = 1 - tm.assert_series_equal(ser, Series([1], index=[1])) + tm.assert_series_equal(ser, Series([1], index=range(1, 2))) ser.loc[3] = 3 tm.assert_series_equal(ser, Series([1, 3], index=[1, 3])) @@ -1938,7 +1938,7 @@ def test_loc_setitem_empty_series_float(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc[1] = 1.0 - tm.assert_series_equal(ser, Series([1.0], index=[1])) + tm.assert_series_equal(ser, Series([1.0], index=range(1, 2))) ser.loc[3] = 3.0 tm.assert_series_equal(ser, Series([1.0, 3.0], index=[1, 3])) @@ -2061,7 +2061,7 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index): N = len(index) arr = np.arange(N).astype(np.int64) - orig = DataFrame(arr, index=index, columns=[0]) + orig = DataFrame(arr, index=index) # key that will requiring object-dtype casting in the index key = "kapow" @@ -2074,7 +2074,7 @@ 
def test_loc_setitem_with_expansion_nonunique_index(self, index): else: assert exp_index[-1] == key exp_data = np.arange(N + 1).astype(np.float64) - expected = DataFrame(exp_data, index=exp_index, columns=[0]) + expected = DataFrame(exp_data, index=exp_index) # Add new row, but no new columns df = orig.copy() diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 5ce78b1c90e76..5591f8ec710e2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1098,7 +1098,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext): tm.assert_frame_equal(actual, expected) # "mi_column_name" sheet - expected.index = list(range(4)) + expected.index = range(4) expected.columns = mi.set_names(["c1", "c2"]) actual = pd.read_excel( mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0 diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 482b331332462..d81fde42d5386 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -330,6 +330,7 @@ def test_multiindex_interval_datetimes(self, tmp_excel): ], ] ), + columns=Index([0]), ) tm.assert_frame_equal(result, expected) @@ -375,7 +376,10 @@ def test_excel_sheet_size(self, tmp_excel): col_df.to_excel(tmp_excel) def test_excel_sheet_by_name_raise(self, tmp_excel): - gt = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) + gt = DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), + index=Index(list(range(10))), + ) gt.to_excel(tmp_excel) with ExcelFile(tmp_excel) as xl: @@ -496,7 +500,9 @@ def test_int_types(self, np_type, tmp_excel): # Test np.int values read come back as int # (rather than float which is Excel's format). df = DataFrame( - np.random.default_rng(2).integers(-10, 10, size=(10, 2)), dtype=np_type + np.random.default_rng(2).integers(-10, 10, size=(10, 2)), + dtype=np_type, + index=Index(list(range(10))), ) df.to_excel(tmp_excel, sheet_name="test1") @@ -512,7 +518,11 @@ def test_int_types(self, np_type, tmp_excel): @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, np_type, tmp_excel): # Test np.float values read come back as float. - df = DataFrame(np.random.default_rng(2).random(10), dtype=np_type) + df = DataFrame( + np.random.default_rng(2).random(10), + dtype=np_type, + index=Index(list(range(10))), + ) df.to_excel(tmp_excel, sheet_name="test1") with ExcelFile(tmp_excel) as reader: @@ -524,7 +534,7 @@ def test_float_types(self, np_type, tmp_excel): def test_bool_types(self, tmp_excel): # Test np.bool_ values read come back as float. 
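A quick sketch of the index distinction behind the explicit ``index=Index(list(range(n)))`` arguments added throughout these Excel round-trip tests (illustrative only, not part of the patch; ``Index(range(n))`` would itself collapse back to a ``RangeIndex``, hence the ``list(...)`` call):

    import numpy as np
    import pandas as pd

    # Default construction produces a lazy RangeIndex ...
    df = pd.DataFrame(np.arange(4))
    print(type(df.index).__name__)        # RangeIndex

    # ... while reading the sheet back with index_col=0 returns a
    # materialized integer Index, so the expected frame pins one explicitly.
    expected = pd.DataFrame(np.arange(4), index=pd.Index(list(range(4))))
    print(type(expected.index).__name__)  # Index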
- df = DataFrame([1, 0, True, False], dtype=np.bool_) + df = DataFrame([1, 0, True, False], dtype=np.bool_, index=Index(list(range(4)))) df.to_excel(tmp_excel, sheet_name="test1") with ExcelFile(tmp_excel) as reader: @@ -535,7 +545,7 @@ def test_bool_types(self, tmp_excel): tm.assert_frame_equal(df, recons) def test_inf_roundtrip(self, tmp_excel): - df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) + df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)], index=Index(list(range(3)))) df.to_excel(tmp_excel, sheet_name="test1") with ExcelFile(tmp_excel) as reader: @@ -632,7 +642,13 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, tmp_excel): df.index.names = ["test"] assert df.index.names == recons.index.names - df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 + df = ( + DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), + index=Index(list(range(10))), + ) + >= 0 + ) df.to_excel( tmp_excel, sheet_name="test1", index_label="test", merge_cells=merge_cells ) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index d83e7b4641e88..fdbfbd004617e 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -516,7 +516,7 @@ def test_nonetype_record_path(self, nulls_fixture): ], record_path=["info"], ) - expected = DataFrame({"i": 2}, index=[0]) + expected = DataFrame({"i": 2}, index=range(1)) tm.assert_equal(result, expected) @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"']) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index d45368dece6d2..ba928abcb30ad 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -139,7 +139,7 @@ def test_numeric_dtype(all_parsers, any_real_numpy_dtype): expected = DataFrame([0, 1], dtype=any_real_numpy_dtype) result = parser.read_csv(StringIO(data), header=None, dtype=any_real_numpy_dtype) - tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result, check_column_type=False) @pytest.mark.usefixtures("pyarrow_xfail") diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index b7e3a13ec28b8..c6efbd8059138 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -368,7 +368,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), - index=Index([1, 7]), + index=range(1, 13, 6), columns=MultiIndex( levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index df821fb740af8..35a3ceb98132d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1699,11 +1699,9 @@ def test_api_roundtrip(conn, request, test_frame1): # HACK! 
if "adbc" in conn_name: - result = result.rename(columns={"__index_level_0__": "level_0"}) - result.index = test_frame1.index - result.set_index("level_0", inplace=True) - result.index.astype(int) - result.index.name = None + result = result.drop(columns="__index_level_0__") + else: + result = result.drop(columns="level_0") tm.assert_frame_equal(result, test_frame1) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 4454607606395..6c9d374935ed5 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -244,7 +244,8 @@ "-87.65362593118043,41.94742799535678,0" ), }, - } + }, + index=range(5), ) @@ -414,7 +415,7 @@ def test_string_charset(parser): df_str = read_xml(StringIO(txt), parser=parser) - df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0]) + df_expected = DataFrame({"c1": 1, "c2": 2}, index=range(1)) tm.assert_frame_equal(df_str, df_expected) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 422ed8d4f3d2b..c781e35e71ca6 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1568,7 +1568,7 @@ def test_mode_boolean_with_na(self): # GH#42107 ser = Series([True, False, True, pd.NA], dtype="boolean") result = ser.mode() - expected = Series({0: True}, dtype="boolean") + expected = Series([True], dtype="boolean") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 89a3c3c5ed8bc..0cf3192ea3a74 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -539,8 +539,8 @@ def test_concat_timedelta64_block(): df = DataFrame({"time": rng}) result = concat([df, df]) - tm.assert_frame_equal(result.iloc[:10], df) - tm.assert_frame_equal(result.iloc[10:], df) + tm.assert_frame_equal(result.iloc[:10], df, check_index_type=False) + tm.assert_frame_equal(result.iloc[10:], df, check_index_type=False) def test_concat_multiindex_datetime_nat(): diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 68d77b79a59e7..e13b042192fc6 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -346,9 +346,11 @@ def test_concat_with_key_not_unique(self, performance_warning): performance_warning, match="indexing past lexsort depth" ): out_a = df_a.loc[("x", 0), :] - df_b = DataFrame( - {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)]) + {"name": [1, 2, 3]}, + index=MultiIndex( + levels=[["x", "y"], range(1)], codes=[[0, 1, 0], [0, 0, 0]] + ), ) with tm.assert_produces_warning( performance_warning, match="indexing past lexsort depth" diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0ab4d08db7cc9..4a6228e47eba0 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2970,7 +2970,7 @@ def test_merge_empty_frames_column_order(left_empty, right_empty): df2 = df2.iloc[:0] result = merge(df1, df2, on=["A"], how="outer") - expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"]) + expected = DataFrame(1, index=range(1), columns=["A", "B", "C", "D"]) if left_empty and right_empty: expected = expected.iloc[:0] elif left_empty: diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 
6a5b58c5da6b5..67ba1d7ca51b7 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -15,7 +15,7 @@ def assert_check_nselect_boundary(vals, dtype, method): # helper function for 'test_boundary_{dtype}' tests ser = Series(vals, dtype=dtype) result = getattr(ser, method)(3) - expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] + expected_idxr = range(3) if method == "nsmallest" else range(3, 0, -1) expected = ser.loc[expected_idxr] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index d049f446edb0c..831c2338045ff 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -234,13 +234,15 @@ def test_reindex_categorical(): tm.assert_series_equal(result, expected) # partial reindexing - expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"])) - expected.index = [1, 2] + expected = Series( + Categorical(values=["b", "c"], categories=["a", "b", "c"]), index=range(1, 3) + ) result = s.reindex([1, 2]) tm.assert_series_equal(result, expected) - expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"])) - expected.index = [2, 3] + expected = Series( + Categorical(values=["c", np.nan], categories=["a", "b", "c"]), index=range(2, 4) + ) result = s.reindex([2, 3]) tm.assert_series_equal(result, expected) @@ -261,11 +263,11 @@ def test_reindex_fill_value(): # floats floats = Series([1.0, 2.0, 3.0]) result = floats.reindex([1, 2, 3]) - expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=range(1, 4)) tm.assert_series_equal(result, expected) result = floats.reindex([1, 2, 3], fill_value=0) - expected = Series([2.0, 3.0, 0], index=[1, 2, 3]) + expected = Series([2.0, 3.0, 0], index=range(1, 4)) tm.assert_series_equal(result, expected) # ----------------------------------------------------------- @@ -273,12 +275,12 @@ def test_reindex_fill_value(): ints = Series([1, 2, 3]) result = ints.reindex([1, 2, 3]) - expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=range(1, 4)) tm.assert_series_equal(result, expected) # don't upcast result = ints.reindex([1, 2, 3], fill_value=0) - expected = Series([2, 3, 0], index=[1, 2, 3]) + expected = Series([2, 3, 0], index=range(1, 4)) assert issubclass(result.dtype.type, np.integer) tm.assert_series_equal(result, expected) @@ -287,11 +289,11 @@ def test_reindex_fill_value(): objects = Series([1, 2, 3], dtype=object) result = objects.reindex([1, 2, 3]) - expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) + expected = Series([2, 3, np.nan], index=range(1, 4), dtype=object) tm.assert_series_equal(result, expected) result = objects.reindex([1, 2, 3], fill_value="foo") - expected = Series([2, 3, "foo"], index=[1, 2, 3], dtype=object) + expected = Series([2, 3, "foo"], index=range(1, 4), dtype=object) tm.assert_series_equal(result, expected) # ------------------------------------------------------------ @@ -299,11 +301,11 @@ def test_reindex_fill_value(): bools = Series([True, False, True]) result = bools.reindex([1, 2, 3]) - expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object) + expected = Series([False, True, np.nan], index=range(1, 4), dtype=object) tm.assert_series_equal(result, expected) result = bools.reindex([1, 2, 3], fill_value=False) - expected = Series([False, True, False], index=[1, 2, 
3]) + expected = Series([False, True, False], index=range(1, 4)) tm.assert_series_equal(result, expected) @@ -318,7 +320,7 @@ def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value): ser = Series([NaT], dtype=dtype) result = ser.reindex([0, 1], fill_value=fill_value) - expected = Series([NaT, fill_value], index=[0, 1], dtype=object) + expected = Series([NaT, fill_value], index=range(2), dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 25e4e1f9ec50c..1ea1b030604a3 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -119,16 +119,16 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.repeat(3)) tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( - DataFrame(columns=[0], dtype=any_string_dtype), + DataFrame(columns=range(1), dtype=any_string_dtype), empty.str.extract("()", expand=True), ) tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=any_string_dtype), + DataFrame(columns=range(2), dtype=any_string_dtype), empty.str.extract("()()", expand=True), ) tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=any_string_dtype), + DataFrame(columns=range(2), dtype=any_string_dtype), empty.str.extract("()()", expand=False), ) tm.assert_frame_equal(empty_df.set_axis([], axis=1), empty.str.get_dummies()) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 56de3f7f39175..2a225bda953cf 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -223,7 +223,6 @@ def test_int64_overflow_how_merge(self, left_right, join_type): out = merge(left, right, how="outer") out.sort_values(out.columns.tolist(), inplace=True) - out.index = np.arange(len(out)) tm.assert_frame_equal(out, merge(left, right, how=join_type, sort=True)) @pytest.mark.slow diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 3a47d87286711..658e16bfe5682 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2013,6 +2013,7 @@ def test_dataframe(self, df, cache): # dict-like result = to_datetime(df[["year", "month", "day"]].to_dict(), cache=cache) + expected.index = Index([0, 1]) tm.assert_series_equal(result, expected) def test_dataframe_dict_with_constructable(self, df, cache): @@ -2021,7 +2022,8 @@ def test_dataframe_dict_with_constructable(self, df, cache): df2["month"] = 2 result = to_datetime(df2, cache=cache) expected2 = Series( - [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")] + [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")], + index=Index([0, 1]), ) tm.assert_series_equal(result, expected2) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index b4a045cd26fe4..b2f76bdd0e2ad 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -502,8 +502,8 @@ def test_expanding_apply_min_periods_0(engine_and_raw): def test_expanding_cov_diff_index(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.expanding().cov(s2) expected = Series([None, None, 2.0]) tm.assert_series_equal(result, expected) @@ -515,14 +515,14 @@ def test_expanding_cov_diff_index(): s1 = 
Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().cov(s2) - expected = Series([None, None, None, 4.5]) + expected = Series([None, None, None, 4.5], index=list(range(4))) tm.assert_series_equal(result, expected) def test_expanding_corr_diff_index(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.expanding().corr(s2) expected = Series([None, None, 1.0]) tm.assert_series_equal(result, expected) @@ -534,7 +534,7 @@ def test_expanding_corr_diff_index(): s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.0]) + expected = Series([None, None, None, 1.0], index=list(range(4))) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 6fae79ee70702..d23c6501ed1d1 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -103,6 +103,7 @@ def test_flex_binary_frame(method, frame): ) res3 = getattr(frame.rolling(window=10), method)(frame2) + res3.columns = Index(list(res3.columns)) exp = DataFrame( {k: getattr(frame[k].rolling(window=10), method)(frame2[k]) for k in frame} ) @@ -143,26 +144,26 @@ def test_corr_sanity(): def test_rolling_cov_diff_length(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.rolling(window=3, min_periods=2).cov(s2) expected = Series([None, None, 2.0]) tm.assert_series_equal(result, expected) - s2a = Series([1, None, 3], index=[0, 1, 2]) + s2a = Series([1, None, 3], index=range(3)) result = s1.rolling(window=3, min_periods=2).cov(s2a) tm.assert_series_equal(result, expected) def test_rolling_corr_diff_length(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.rolling(window=3, min_periods=2).corr(s2) expected = Series([None, None, 1.0]) tm.assert_series_equal(result, expected) - s2a = Series([1, None, 3], index=[0, 1, 2]) + s2a = Series([1, None, 3], index=range(3)) result = s1.rolling(window=3, min_periods=2).corr(s2a) tm.assert_series_equal(result, expected) From 845d1e24eca366f0dd73667bf533a8710de29369 Mon Sep 17 00:00:00 2001 From: Yi-Han Chen <20080114+tan-i-ham@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:04:00 -0400 Subject: [PATCH 223/272] DOC: Add SA01 for `pandas.api.types.is_string_dtype` (#59305) DOC: Add see also for pandas.api.types.is_string_dtype --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1e9250fd77fe5..a1f8977b6b115 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -304,7 +304,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.is_sparse SA01" \ - -i "pandas.api.types.is_string_dtype SA01" \ -i "pandas.api.types.is_timedelta64_ns_dtype SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.api.types.union_categoricals RT03,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 
7db3f8ecebf2a..cd1d5366d6a08 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -558,6 +558,11 @@ def is_string_dtype(arr_or_dtype) -> bool:
     boolean
         Whether or not the array or dtype is of the string dtype.

+    See Also
+    --------
+    api.types.is_string_dtype : Check whether the provided array or dtype
+        is of the string dtype.
+
     Examples
     --------
     >>> from pandas.api.types import is_string_dtype

From dca602bda9efe0fefa352d5c226cfc53939793a4 Mon Sep 17 00:00:00 2001
From: taranarmo
Date: Wed, 24 Jul 2024 23:58:30 +0200
Subject: [PATCH 224/272] BUG: add note to docstring about `dtype` keyword when
 creating Series from another Series (#59300)

* BUG: add note on creating Series from Series with dtype keyword

When creating a Series from another Series, the `dtype` keyword is ignored.
See GH #59060. Add a note to the Series and DataFrame constructor docstrings
to underline this behaviour.

* make backticks double

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 pandas/core/frame.py  | 1 +
 pandas/core/series.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ee48f546815bb..1a0d564197417 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -531,6 +531,7 @@ class DataFrame(NDFrame, OpsMixin):
         will perform column selection instead.
     dtype : dtype, default None
         Data type to force. Only a single dtype is allowed. If None, infer.
+        If ``data`` is a DataFrame, ``dtype`` is ignored.
     copy : bool or None, default None
         Copy data from inputs.
         For dict data, the default of None behaves like ``copy=True``. For DataFrame
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 9209a80ada0d1..f340821775015 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -256,6 +256,7 @@ class Series(base.IndexOpsMixin, NDFrame):  # type: ignore[misc]
         Data type for the output Series. If not specified, this will be
         inferred from `data`.
         See the :ref:`user guide ` for more usages.
+        If ``data`` is a Series, ``dtype`` is ignored.
     name : Hashable, default None
         The name to give to the Series.
copy : bool, default False From 1500b525977de5d91f5316413cdf8dcd97b28060 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 Jul 2024 12:05:56 -1000 Subject: [PATCH 225/272] CLN: Remove unnecessary iterators (#59297) --- pandas/core/apply.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d024afa570a1e..5959156d11123 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -486,20 +486,14 @@ def compute_dict_like( cols = df[key] if cols.ndim == 1: - series_list = [obj._gotitem(key, ndim=1, subset=cols)] + series = obj._gotitem(key, ndim=1, subset=cols) + results.append(getattr(series, op_name)(how, **kwargs)) + keys.append(key) else: - series_list = [] - for index in range(cols.shape[1]): - col = cols.iloc[:, index] - + for _, col in cols.items(): series = obj._gotitem(key, ndim=1, subset=col) - series_list.append(series) - - for series in series_list: - result = getattr(series, op_name)(how, **kwargs) - results.append(result) - keys.append(key) - + results.append(getattr(series, op_name)(how, **kwargs)) + keys.append(key) else: results = [ getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) From 63dc1bb4f99d24b46bacb113d740d54459fdbe5e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:45:41 -1000 Subject: [PATCH 226/272] CI: xfail test_to_read_gcs for pyarrow=17 (#59306) --- pandas/tests/io/test_gcs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 17b89c9f31616..434642ed7fc90 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under17p0 + from pandas import ( DataFrame, Index, @@ -52,7 +54,7 @@ def ls(self, path, **kwargs): # Patches pyarrow; other processes should not pick up change @pytest.mark.single_cpu @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) -def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): +def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys, request): """ Test that many to/read functions support GCS. 
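The ``compute_dict_like`` cleanup in patch 225 above swaps positional ``iloc`` lookups for direct iteration over ``(label, column)`` pairs. A standalone sketch of the idiom on a toy frame (illustrative; the patch applies it inside pandas internals, not user code):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    # before: index-based access to each column
    for i in range(df.shape[1]):
        col = df.iloc[:, i]

    # after: DataFrame.items() yields each column label and Series directly
    for _, col in df.items():
        print(col.sum())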
@@ -96,6 +98,13 @@ def from_uri(path):
             to_local = pathlib.Path(path.replace("gs://", "")).absolute().as_uri()
             return pa_fs.LocalFileSystem(to_local)

+    request.applymarker(
+        pytest.mark.xfail(
+            not pa_version_under17p0,
+            raises=TypeError,
+            reason="pyarrow 17 broke the mocked filesystem",
+        )
+    )
     with monkeypatch.context() as m:
         m.setattr(pa_fs, "FileSystem", MockFileSystem)
         df1.to_parquet(path)

From ecea7c31283c490e29da62dc0b0027a272998a6f Mon Sep 17 00:00:00 2001
From: Yuri Batista Ishizawa
Date: Thu, 25 Jul 2024 12:16:57 -0300
Subject: [PATCH 227/272] DOC: Clarify row delete comparison to SQL (#59311)

---
 doc/source/getting_started/comparison/comparison_with_sql.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
index daa528c7d408a..dc0590f18751a 100644
--- a/doc/source/getting_started/comparison/comparison_with_sql.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -505,7 +505,7 @@ DELETE
     DELETE FROM tips WHERE tip > 9;

-In pandas we select the rows that should remain instead of deleting them:
+In pandas we select the rows that should remain instead of deleting the rows that should be removed:

 .. ipython:: python

From d6c9941b730855aaa03fd38f0d1216140f90e1ec Mon Sep 17 00:00:00 2001
From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com>
Date: Thu, 25 Jul 2024 18:50:37 +0100
Subject: [PATCH 228/272] BUG: Integer values at the top end of the supported
 range incorrectly… (#59310)

* BUG: Integer values at the top end of the supported range incorrectly
  interpreted as missing for format versions 111 and prior

* StataMissingValue expects value passed in to be of float type, so cast
  to this

* Add type hint to StataParser.MISSING_VALUES to avoid mypy error when
  constructing StataMissingValue from value

---
 doc/source/whatsnew/v3.0.0.rst                |   1 +
 pandas/io/stata.py                            |  37 ++++++--
 pandas/tests/io/data/stata/stata1_108.dta     | Bin 0 -> 703 bytes
 pandas/tests/io/data/stata/stata1_110.dta     | Bin 0 -> 945 bytes
 pandas/tests/io/data/stata/stata1_111.dta     | Bin 0 -> 945 bytes
 pandas/tests/io/data/stata/stata1_113.dta     | Bin 0 -> 945 bytes
 pandas/tests/io/data/stata/stata1_115.dta     | Bin 0 -> 1130 bytes
 pandas/tests/io/data/stata/stata1_118.dta     | Bin 0 -> 3774 bytes
 pandas/tests/io/data/stata/stata1_119.dta     | Bin 0 -> 3788 bytes
 pandas/tests/io/data/stata/stata8_108.dta     | Bin 0 -> 703 bytes
 pandas/tests/io/data/stata/stata8_110.dta     | Bin 0 -> 945 bytes
 pandas/tests/io/data/stata/stata8_111.dta     | Bin 0 -> 945 bytes
 .../data/stata/stata_int_validranges_102.dta  | Bin 0 -> 238 bytes
 .../data/stata/stata_int_validranges_103.dta  | Bin 0 -> 240 bytes
 .../data/stata/stata_int_validranges_104.dta  | Bin 0 -> 238 bytes
 .../data/stata/stata_int_validranges_105.dta  | Bin 0 -> 274 bytes
 .../data/stata/stata_int_validranges_108.dta  | Bin 0 -> 470 bytes
 .../data/stata/stata_int_validranges_110.dta  | Bin 0 -> 616 bytes
 .../data/stata/stata_int_validranges_111.dta  | Bin 0 -> 616 bytes
 .../data/stata/stata_int_validranges_113.dta  | Bin 0 -> 616 bytes
 .../data/stata/stata_int_validranges_114.dta  | Bin 0 -> 727 bytes
 .../data/stata/stata_int_validranges_115.dta  | Bin 0 -> 727 bytes
 .../data/stata/stata_int_validranges_117.dta  | Bin 0 -> 1174 bytes
 .../data/stata/stata_int_validranges_118.dta  | Bin
0 -> 2499 bytes .../data/stata/stata_int_validranges_119.dta | Bin 0 -> 2509 bytes pandas/tests/io/test_stata.py | 83 +++++++++++++++++- 26 files changed, 112 insertions(+), 9 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata1_108.dta create mode 100644 pandas/tests/io/data/stata/stata1_110.dta create mode 100644 pandas/tests/io/data/stata/stata1_111.dta create mode 100644 pandas/tests/io/data/stata/stata1_113.dta create mode 100644 pandas/tests/io/data/stata/stata1_115.dta create mode 100644 pandas/tests/io/data/stata/stata1_118.dta create mode 100644 pandas/tests/io/data/stata/stata1_119.dta create mode 100644 pandas/tests/io/data/stata/stata8_108.dta create mode 100644 pandas/tests/io/data/stata/stata8_110.dta create mode 100644 pandas/tests/io/data/stata/stata8_111.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_102.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_103.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_104.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_105.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_108.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_110.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_111.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_113.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_114.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_115.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_117.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_118.dta create mode 100644 pandas/tests/io/data/stata/stata_int_validranges_119.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5d89613bd3d4f..e71220102cbb4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -583,6 +583,7 @@ I/O - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) +- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index dd92b1bbfdba0..03c15d0ab07bb 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -983,6 +983,19 @@ def __init__(self) -> None: np.float64(struct.unpack(" None: # These missing values are the generic '.' 
in Stata, and are used # to replace nans - self.MISSING_VALUES = { + self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = { "b": 101, "h": 32741, "l": 2147483621, @@ -1808,11 +1821,18 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] - if fmt not in self.VALID_RANGE: - continue + if self._format_version <= 111: + if fmt not in self.OLD_VALID_RANGE: + continue - fmt = cast(str, fmt) # only strs in VALID_RANGE - nmin, nmax = self.VALID_RANGE[fmt] + fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE + nmin, nmax = self.OLD_VALID_RANGE[fmt] + else: + if fmt not in self.VALID_RANGE: + continue + + fmt = cast(str, fmt) # only strs in VALID_RANGE + nmin, nmax = self.VALID_RANGE[fmt] series = data.iloc[:, i] # appreciably faster to do this with ndarray instead of Series @@ -1827,7 +1847,12 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra umissing, umissing_loc = np.unique(series[missing], return_inverse=True) replacement = Series(series, dtype=object) for j, um in enumerate(umissing): - missing_value = StataMissingValue(um) + if self._format_version <= 111: + missing_value = StataMissingValue( + float(self.MISSING_VALUES[fmt]) + ) + else: + missing_value = StataMissingValue(um) loc = missing_loc[umissing_loc == j] replacement.iloc[loc] = missing_value diff --git a/pandas/tests/io/data/stata/stata1_108.dta b/pandas/tests/io/data/stata/stata1_108.dta new file mode 100644 index 0000000000000000000000000000000000000000..6c948b44905899da630faf406fe6b41ac683e434 GIT binary patch literal 703 zcmc~{Vq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1CX~Ny^Mgi_gt0E(Y3{0%Ih>7@06e z4vc~9ELBTAgLJ5Xs-Xd#qiO+XGN7;xjSv#Z292_ZYzWjtT>hZG{(t@d|NrX&c9|dv literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_110.dta b/pandas/tests/io/data/stata/stata1_110.dta new file mode 100644 index 0000000000000000000000000000000000000000..c9e2ca72dbd4e4ac7fea459d0125fec04824c841 GIT binary patch literal 945 zcmc~}Vq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1CX~Ny^Mg%gIkHiOt%lmyX8*REMcr>KUX%g;Wg<;2c#8IFkW|ZD@p$Ks9QV XHyQ#YhCn?;{e$}Y|Mma>|E~uCWKkzd literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_111.dta b/pandas/tests/io/data/stata/stata1_111.dta new file mode 100644 index 0000000000000000000000000000000000000000..21370d302745819ee36866fcc9c7d2a13e210d15 GIT binary patch literal 945 zcmd02Vq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1Iu7|M#E2X*v0cCGokL#l<-6Ovx`z z%1OneBB`5sDDsj|G)nK|Nr#>`hO}r literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_113.dta b/pandas/tests/io/data/stata/stata1_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..6fcf55f0406e99492a718976080d00f98ec99dd6 GIT binary patch literal 945 zcmXS9Vq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1Iu7|M#E2X*v0cCGokL#l<-6Ovx`z z%1OneBB`5sDDtO`n3M(|Nr#>05&P_ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_115.dta b/pandas/tests/io/data/stata/stata1_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..2e5258da49c3c90f7ade257ec2318fb63d0ab0a8 GIT binary patch literal 1130 zcmXSBVq{=tU}T^HFf>tcEKOH1GB7k&Ff_3;v1Iu7|M#E2X*v0cCGokL#l<-6Ovx`z z%1OneBB`Pbf{X8rk;Ux4TeS} cT7ln-qq3tRKp+I_A@UFEQ=is9{r|rn0D!D1@c;k- literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_118.dta b/pandas/tests/io/data/stata/stata1_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..26d7beccb745c58247d09744af47435900ea0d5d GIT binary patch literal 3774 
zcmeHKJx;?g6gEN?8xkX1m#(-e9a{8c34vIOx^RHVZQM4J(^R&L)F~S`VC4`TfQuj@ zmR^Cs#l*4Ol(Iz?kx(+^%k%T|=X`ngi$N-?s2?awV8Up?1OdS~qmmKN>i}LEFjy!S z3)A8H<_d6bLXXrC;2wcK*+cdM+$X^4fN_FSpM3%qMl9uvX5`3g*U#tUdaLQ3*1h&w zy9+q?0$xoeqO)`Wdp$Tjb|jY$U-qp-JH$0>KOb1UIJ7u&c<1om;iJPRhky+$+)@$J z0cXBi%$Owa`b~f7PyL|*Z@Em@DNSFEjCf3yAB9p{%0Isc@!U+gycw9ZD3Knaa9ZjI zK#0fFaj6Pq`DybJWGoa9rCKi{laYYe%`l2ZM3p3s?n!f8BwZU`vsh%KQ!EY@5A<5a zC;_izN|!pHGe6tUe5-L0(W;nAft@N~*8Fy2<`pbUSNUTP*eQE8SfxOvK&3#X!2efZ zWH;S2Q-7@nd F>o+7w&29hy literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata1_119.dta b/pandas/tests/io/data/stata/stata1_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..284daa78bf6db8ba8e3d357cd425d5bd58b73fbe GIT binary patch literal 3788 zcmeHKy-ve05Do&w1QIh#m#(-e9Vj|1v2-ayLOej^Hf|fqX)4=A>Xet@0g!kR9)O4F z(t#J?ZwYqnHdPQ(6{$o*@sQ7F-}x?I&b}CgB$9Z(Bp61N_>^Pdlu;rmcHK6hr2xH| zq!HH&*LNL2nFw9H?>G*iE`}Stqq@2Qy1~Fmk20(U(*qfVRCx?1_}FdLuBO9Uv*DiA z+}1@)`O0PiErSWkL@i+2qs0^Jq+@ZuYbFl&3|{JZRC_%zcza|}T70nhWbxVJi$y@% zE8G(vkRGF+oK2{}@5x*8m3$^23837RRvi=dYIwjRBE2vW!npkNryosq$mK<#6Qe}; zjuVVa{Q!=`qVcd)2GaaA`3NH7k_AGpHc>t!0j*SE5b=;mf$Qzl#xSqAiClLZ#VqUR z#q6BohHkT%OF&DS)V60+>ZNC!XZYTPq)Mk^V5fdTy(@zs@6*PqysFpudXUL;8jD4hWzIn#y~ N*NT}Q&579s_8Sim%=Q2P literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_108.dta b/pandas/tests/io/data/stata/stata8_108.dta new file mode 100644 index 0000000000000000000000000000000000000000..962f7f4331fb34331f381d9c62d336cc30d11a90 GIT binary patch literal 703 zcmc~{Vq{=tU}T^HFf>|%B3`vwpYHGY|{NFg5}) y(sJ?>!Avm2%mgH+#snn5K-E&uAe~G$RNE*ultZBYfBpae|LcKX0AV0_P!9le5FZQx literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_110.dta b/pandas/tests/io/data/stata/stata8_110.dta new file mode 100644 index 0000000000000000000000000000000000000000..a7fe9a3b7e639c4fba3d4746e7f0844e8c185070 GIT binary patch literal 945 zcmc~}Vq{=tU}T^HFf>|%B3`vh#v|NsBj19gHh5Im>{ E0J3Wz4gdfE literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_111.dta b/pandas/tests/io/data/stata/stata8_111.dta new file mode 100644 index 0000000000000000000000000000000000000000..cb96ac0e0f5d3700cf104804fc31858539cd9376 GIT binary patch literal 945 zcmd02Vq{=tU}T^HFf>|%B48Q;U{r5jJuf!rApLIY5hGzIw02LS; z;Zu;7lb?uJ30Q%d2|hE_m|$*HwbV06hjK_|!^{|^M?-*&5UBrO|NsC0dZ11a27(9m E07!`;{{R30 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_102.dta b/pandas/tests/io/data/stata/stata_int_validranges_102.dta new file mode 100644 index 0000000000000000000000000000000000000000..69de2e2f7f91d930ef6796a70410dfad29b2f991 GIT binary patch literal 238 zcmYdeU}RusU}EshD@jdHEmFwI%*`w*R?sjsFj7d%FUn0U(PRLD%*>pm%92zFJFf&x sGvwswr9)UyQq@Aw04M^Z42{5C6eV=!H2i0109x2k$MCQI-~a#h0M@V@4FCWD literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_103.dta b/pandas/tests/io/data/stata/stata_int_validranges_103.dta new file mode 100644 index 0000000000000000000000000000000000000000..71f03873808e228f3ed7ba48d5194d2a19713d7d GIT binary patch literal 240 zcmYdiVq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=e$NXswEO)Sv_$}%uyX67VSmZU=1 wc_m<)AtygC9m0Z=sup?%KoJ;aXawe>D4{E-;XgwI(87i~hJW?{{{OEB0NX4Z5dZ)H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_104.dta b/pandas/tests/io/data/stata/stata_int_validranges_104.dta new file mode 100644 index 0000000000000000000000000000000000000000..f6dff2a6b42d9aff6e6974078cca14d1575defed GIT binary patch literal 238 zcmc~`Vq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=exNXswEO)Sv_$}%t{W#%MRmZXB& 
vnRz8(njt4YFCD^ylByPZ20#%QWoQKEq9~y&r-7jXXkSC!zxsdw|JMTmpXnM6 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_105.dta b/pandas/tests/io/data/stata/stata_int_validranges_105.dta new file mode 100644 index 0000000000000000000000000000000000000000..d0a7ad0f01d16a37c331897ccbe4a79bd6bc3d17 GIT binary patch literal 274 zcmc~~Vq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=e#NXswEO)Sv_%3=danK?<7C8=Pk t%)AnalAQd!bQHy^7J3FS5frwe5kdk*2faBB3=Kdl8|wbm|NH;H9sra<8V>*f literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_108.dta b/pandas/tests/io/data/stata/stata_int_validranges_108.dta new file mode 100644 index 0000000000000000000000000000000000000000..47b715bce21efa225230a841b0ce84994462dcfb GIT binary patch literal 470 zcmc~{Vq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf_1GNXswEO)Sv_%2EX+W#%MRmZXBs u%FHW)n3JVIuRKFeeqK6$6;N4K3q1pvN))!C5ki6}lSUPh5dsYi4M1HDb^q%B{r_JN0NXPf A5C8xG literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_111.dta b/pandas/tests/io/data/stata/stata_int_validranges_111.dta new file mode 100644 index 0000000000000000000000000000000000000000..07052d824f13229f45844e63fe408a780775ef67 GIT binary patch literal 616 zcmd02Vq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=q&NXswEO)Sv_%2EaV{_{7fvLqF+ zS($kyc;y*#^7GR1tANU?TIdL6pNs6_F7Fjf@S93=9n^PwJoi|6dOP D-$EOT literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_115.dta b/pandas/tests/io/data/stata/stata_int_validranges_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..80e1dc8670b38cdd5cf0ae1e58a2139e475dd75b GIT binary patch literal 727 zcmXSBVq{=uU}9ik@XRYoO;0US$jQvjEGbsdFf=q(NXswEO)Sv_%2EaV{_{7fvLqF+ zS($kyc;y*#^7GR1tANU?TId-Nr;=<9hDIb>L6pNs6_F7Fjf@S93=9n^PwJoi|6dOP DQD&}dH&zue$OwqK^ty(oEVNUB%Y8$P$Y?K!gl)-*dl=5*if!q!TrN6 zuquQOwi>`X2wmK3G;FtvAbF1@9^Pe!2Bab>Qekv-Vwa4ba+Pv(3e4Al5h+;}=?LxA zDgd_7lX5#0+DE{nrtjPY{0#UNaO#xUcF^(|?R&fq_!uw^_!jUzATY-YFI)-UlN6ir zh&28dKgD+etd@3V8MnJY=ZYs<^A(v{GZNA=&juSSN_Ni%I#)&tZI<=`Tc*JKtnv8j z?0oZ*HNNd9v@f=!0Jackt&`6bmq#J?yQ-<#Md&=N(^mhw6r-1(?n1ynU32AR2NR$_ xt2a$%lUY$rfR#zmMoBj(;EL-mTMMhFXby%#yFLH_ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_118.dta b/pandas/tests/io/data/stata/stata_int_validranges_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..4bbd823bff63ef38509cf1511ffe2787ef60c2a5 GIT binary patch literal 2499 zcmeHJy-ve05N@d$cm|eA2niNUm^z$pM1n#W9w2fX(^_(p%07@fwamN$Prw^6G4Up- z4*@K}j@_mzTbBqa9`fbRKX?A_Y#$in#N&iG!bI>yXap^k;6`8+^?-SRY}4doAC94Gvr#RopL$_<_7`ER9IKi zDcYqf0ocl^l-sV*UIQMr{m*s4&tSd?_}=mT<9&~}2Og&Zp98)Gd<*yy5SX(Hk6cSW zR3fI?R2cjzzKgE{*p}K2%edVII%>S3n!Yllnk)yiD$mBXdQe8kM*~c*Nl8QN^Z~Y3 zfzLyO$Gx-jom}31Vsx>aNnk6dZOW+!5tm0O_Pec>^~Shkz~$a{SgJdu43=5vVnbQO zK*PYlWMHEL2!|R*6R!D)gtpI(92o@mN0Zoc3!mBxTe2*Y<$PHbb6}+s3~A-&1l+hq KoOVmB>b76MbG>8$ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata_int_validranges_119.dta b/pandas/tests/io/data/stata/stata_int_validranges_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..6bd9bbde1d22d5d560540682de2df35bc79df337 GIT binary patch literal 2509 zcmeHJu};G<5KTc0d;?3DO0Zzc1k&k7B-E-417qYiM;pmWifxd}(()1f0TX}0#Kd3p zBY-8?vC~xTmL*F(FEnM+P8iIAYZyz3e9oHFpTzm7!mXT>+9v_dKe1Gil#p9dDcaNVQzdZu8R^ge; zn2!a8dNC)3fBA3zlLOmP+i58`yFlw4XQ-iXoa%-wd$S@-r;RcwqT`|gDwA5oN^jDq zWCmpIVR4i~x9f6`EjH!>n}fSysqT^@SZXgP z8)MrVZavU?ppFN&Dgl3}VKU>APc%vM?8Fm=!1gqeDZKEtvofX2Dq1hsdA>7_-w}{p5h*bsm51<~r$p8QV literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py 
b/pandas/tests/io/test_stata.py index 6d6f222fc0660..fb7182fdefb32 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -120,9 +120,11 @@ def test_read_index_col_none(self, version, temp_file): expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(read_df, expected, check_index_type=True) - @pytest.mark.parametrize("file", ["stata1_114", "stata1_117"]) - def test_read_dta1(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + # Note this test starts at format version 108 as the missing code for double + # was different prior to this (see GH 58149) and would therefore fail + @pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119]) + def test_read_dta1(self, version, datapath): + file = datapath("io", "data", "stata", f"stata1_{version}.dta") parsed = self.read_dta(file) # Pandas uses np.nan as missing value. @@ -136,6 +138,18 @@ def test_read_dta1(self, file, datapath): # the casting doesn't fail so need to match stata here expected["float_miss"] = expected["float_miss"].astype(np.float32) + # Column names too long for older Stata formats + if version <= 108: + expected = expected.rename( + columns={ + "float_miss": "f_miss", + "double_miss": "d_miss", + "byte_miss": "b_miss", + "int_miss": "i_miss", + "long_miss": "l_miss", + } + ) + tm.assert_frame_equal(parsed, expected) def test_read_dta2(self, datapath): @@ -920,6 +934,23 @@ def test_missing_value_conversion(self, file, datapath): ) tm.assert_frame_equal(parsed, expected) + # Note this test starts at format version 108 as the missing code for double + # was different prior to this (see GH 58149) and would therefore fail + @pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"]) + def test_missing_value_conversion_compat(self, file, datapath): + columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] + smv = StataMissingValue(101) + keys = sorted(smv.MISSING_VALUES.keys()) + data = [] + row = [StataMissingValue(keys[j * 27]) for j in range(5)] + data.append(row) + expected = DataFrame(data, columns=columns) + + parsed = read_stata( + datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + ) + tm.assert_frame_equal(parsed, expected) + def test_big_dates(self, datapath, temp_file): yr = [1960, 2000, 9999, 100, 2262, 1677] mo = [1, 1, 12, 1, 4, 9] @@ -2035,6 +2066,52 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + @pytest.mark.parametrize("version", [113, 114, 115, 117, 118, 119]) + def test_read_data_int_validranges(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-127, 100], dtype=np.int8), + "int": np.array([-32767, 32740], dtype=np.int16), + "long": np.array([-2147483647, 2147483620], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("version", [104, 105, 108, 110, 111]) + def test_read_data_int_validranges_compat(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-128, 126], dtype=np.int8), + "int": np.array([-32768, 32766], dtype=np.int16), + "long": np.array([-2147483648, 2147483646], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + + # The byte type was not supported prior to 
the 104 format + @pytest.mark.parametrize("version", [102, 103]) + def test_read_data_int_validranges_compat_nobyte(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-128, 126], dtype=np.int16), + "int": np.array([-32768, 32766], dtype=np.int16), + "long": np.array([-2147483648, 2147483646], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114]) def test_backward_compat(version, datapath): From ebc60f2d812487cbdb35d0dc61ca8fa144b9a327 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 26 Jul 2024 11:36:25 +0200 Subject: [PATCH 229/272] TST / string dtype: add env variable to enable future_string and add test build (#58459) --- .github/workflows/unit-tests.yml | 5 +++++ ci/run_tests.sh | 6 ++++++ pandas/core/config_init.py | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 261859a14459a..c0461943ce9c8 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -57,6 +57,10 @@ jobs: # Also install zh_CN (its encoding is gb2312) but do not activate it. # It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" + - name: "Future infer strings" + env_file: actions-311.yaml + pattern: "not slow and not network and not single_cpu" + pandas_future_infer_string: "1" - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" @@ -75,6 +79,7 @@ jobs: LANG: ${{ matrix.lang || 'C.UTF-8' }} LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_CI: '1' + PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }} TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: 'auto' PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} diff --git a/ci/run_tests.sh b/ci/run_tests.sh index d2c2f58427a23..c6071100fc86f 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -16,5 +16,11 @@ if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi +# temporarily let pytest always succeed (many tests are not yet passing in the +# build enabling the future string dtype) +if [[ "$PANDAS_FUTURE_INFER_STRING" == "1" ]]; then + PYTEST_CMD="$PYTEST_CMD || true" +fi + echo $PYTEST_CMD sh -c "$PYTEST_CMD" diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 05661033bd5ed..352020f45388f 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -858,7 +858,7 @@ def register_converter_cb(key: str) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", From e6c292ee99804c0d98de530a1204002878aebef5 Mon Sep 17 00:00:00 2001 From: zslsally <47940015+zslsally@users.noreply.github.com> Date: Fri, 26 Jul 2024 12:53:49 -0400 Subject: [PATCH 230/272] DOC: Add RT03,SA01 for pandas.api.types.union_categoricals (#59319) * Add RT03,SA01 for pandas.api.types.union_categoricals * Update after rerun the check --- ci/code_checks.sh | 1 - pandas/core/dtypes/concat.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a1f8977b6b115..f0d04dd33640d 100755 
--- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -306,7 +306,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_sparse SA01" \ -i "pandas.api.types.is_timedelta64_ns_dtype SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ - -i "pandas.api.types.union_categoricals RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ -i "pandas.arrays.BooleanArray SA01" \ -i "pandas.arrays.DatetimeArray SA01" \ diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 17e68b0e19a68..dcf8cb5c78536 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -190,6 +190,7 @@ def union_categoricals( Returns ------- Categorical + The union of categories being combined. Raises ------ @@ -201,6 +202,11 @@ def union_categoricals( ValueError Empty list of categoricals passed + See Also + -------- + CategoricalDtype : Type for categorical data with the categories and orderedness. + Categorical : Represent a categorical variable in classic R / S-plus fashion. + Notes ----- To learn more about categories, see `link From 0be983bf89a6de80432c5637099770afd8ea3ee9 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 26 Jul 2024 18:18:50 +0100 Subject: [PATCH 231/272] DOC: warn about when to not use the interchange protocol (#59322) docs: warn about when to not use the interchange protocol --- pandas/core/frame.py | 8 ++++++++ pandas/core/interchange/from_dataframe.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a0d564197417..b897e868ce134 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -901,6 +901,14 @@ def __dataframe__( """ Return the dataframe interchange object implementing the interchange protocol. + .. warning:: + + Due to severe implementation issues, we recommend only considering using the + interchange protocol in the following cases: + + - converting to pandas: for pandas >= 2.0.3 + - converting from pandas: for pandas >= 3.0.0 + Parameters ---------- nan_as_null : bool, default False diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 4575837fb12fc..869ff43728860 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -36,6 +36,14 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: """ Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + .. warning:: + + Due to severe implementation issues, we recommend only considering using the + interchange protocol in the following cases: + + - converting to pandas: for pandas >= 2.0.3 + - converting from pandas: for pandas >= 3.0.0 + Parameters ---------- df : DataFrameXchg From 445a9d526cf87dbd9f35291630d7d3e5bd95fffc Mon Sep 17 00:00:00 2001 From: Benjamin M <137508630+bzm10@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:19:59 +0300 Subject: [PATCH 232/272] DOC: Fix link to Conda package in README.md (#59321) Fix link to Conda package in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e5329d66c2d89..715b0c9dc459c 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ The source code is currently hosted on GitHub at: https://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python -Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/). 
+Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://anaconda.org/conda-forge/pandas). ```sh # conda From 0e0814bdc40ea7e8d1c63b7591c33958c1ec018a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 26 Jul 2024 19:20:59 +0200 Subject: [PATCH 233/272] REF (string dtype): rename using_pyarrow_string_dtype to using_string_dtype (#59320) --- pandas/_config/__init__.py | 2 +- pandas/_libs/lib.pyx | 4 +- pandas/core/construction.py | 10 ++-- pandas/core/dtypes/cast.py | 4 +- pandas/core/internals/construction.py | 4 +- pandas/io/feather_format.py | 6 +-- pandas/io/orc.py | 4 +- pandas/io/parquet.py | 4 +- pandas/io/parsers/arrow_parser_wrapper.py | 4 +- pandas/io/pytables.py | 10 ++-- pandas/io/sql.py | 4 +- pandas/tests/arithmetic/test_object.py | 4 +- .../arrays/categorical/test_constructors.py | 4 +- pandas/tests/arrays/categorical/test_repr.py | 4 +- pandas/tests/base/test_misc.py | 4 +- pandas/tests/base/test_unique.py | 4 +- pandas/tests/extension/base/ops.py | 4 +- pandas/tests/extension/test_categorical.py | 4 +- .../frame/constructors/test_from_dict.py | 6 +-- .../frame/constructors/test_from_records.py | 4 +- pandas/tests/frame/methods/test_fillna.py | 6 +-- .../tests/frame/methods/test_interpolate.py | 6 +-- pandas/tests/frame/methods/test_replace.py | 54 +++++-------------- pandas/tests/frame/test_api.py | 4 +- pandas/tests/frame/test_arithmetic.py | 6 +-- pandas/tests/frame/test_constructors.py | 6 +-- pandas/tests/frame/test_reductions.py | 8 ++- pandas/tests/frame/test_repr.py | 4 +- .../tests/indexes/base_class/test_formats.py | 6 +-- .../indexes/categorical/test_category.py | 4 +- .../tests/indexes/categorical/test_formats.py | 4 +- pandas/tests/indexes/interval/test_formats.py | 4 +- pandas/tests/indexes/test_old_base.py | 4 +- pandas/tests/indexing/test_coercion.py | 4 +- pandas/tests/indexing/test_indexing.py | 8 ++- pandas/tests/indexing/test_loc.py | 4 +- pandas/tests/io/excel/test_readers.py | 6 +-- pandas/tests/io/formats/test_format.py | 10 ++-- pandas/tests/io/formats/test_to_string.py | 4 +- pandas/tests/io/json/test_pandas.py | 6 +-- pandas/tests/reshape/test_pivot.py | 8 +-- pandas/tests/series/indexing/test_where.py | 4 +- pandas/tests/series/methods/test_reindex.py | 6 +-- pandas/tests/series/methods/test_replace.py | 6 +-- pandas/tests/series/test_formats.py | 4 +- 45 files changed, 117 insertions(+), 163 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index e746933ac0bf7..80d9ea1b364f3 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -30,6 +30,6 @@ from pandas._config.display import detect_console_encoding -def using_pyarrow_string_dtype() -> bool: +def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b78ff19bcfd53..2650d60eb3cef 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,7 +37,7 @@ from cython cimport ( floating, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.missing import check_na_tuples_nonequal @@ -2699,7 +2699,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff 
--git a/pandas/core/construction.py b/pandas/core/construction.py index 360e1d5ddd3ff..32792aa7f0543 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -16,7 +16,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas._libs.tslibs import ( @@ -571,11 +571,7 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - if ( - isinstance(data, str) - and using_pyarrow_string_dtype() - and original_dtype is None - ): + if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype dtype = StringDtype("pyarrow_numpy") @@ -609,7 +605,7 @@ def sanitize_array( subarr = data if data.dtype == object and infer_object: subarr = maybe_infer_to_datetimelike(data) - elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f2af69fcc9d84..21e45505b40fc 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import ( Interval, @@ -798,7 +798,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj - if using_pyarrow_string_dtype(): + if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(storage="pyarrow_numpy") diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 0d149f47fd08c..c31479b3011e5 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -14,7 +14,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib @@ -301,7 +301,7 @@ def ndarray_to_mgr( bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] - elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): dtype = StringDtype(storage="pyarrow_numpy") obj_columns = list(values) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 16d4e1f9ea25d..3df3e77a851a3 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -8,7 +8,7 @@ ) import warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -131,7 +131,7 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): + if dtype_backend is lib.no_default and not using_string_dtype(): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -155,7 +155,7 @@ def read_feather( elif dtype_backend == "pyarrow": return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): return 
pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) else: raise NotImplementedError diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 3bca8ea7ef1df..b297164d5d108 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,7 +9,7 @@ Literal, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -136,7 +136,7 @@ def read_orc( df = pa_table.to_pandas(types_mapper=mapping.get) return df else: - if using_pyarrow_string_dtype(): + if using_string_dtype(): types_mapper = arrow_string_types_mapper() else: types_mapper = None diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 306b144811898..77a9cc3fca644 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -15,7 +15,7 @@ filterwarnings, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -257,7 +257,7 @@ def read( to_pandas_kwargs["types_mapper"] = mapping.get elif dtype_backend == "pyarrow": to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() path_or_handle, handles, filesystem = _get_path_or_handle( diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index cffdb28e2c9e4..86bb5f190e403 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -301,7 +301,7 @@ def read(self) -> DataFrame: dtype_mapping = _arrow_dtype_mapping() dtype_mapping[pa.null()] = pd.Int64Dtype() frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1420ce84b4db8..4b569fb7e39e2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -30,7 +30,7 @@ from pandas._config import ( config, get_option, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -3294,7 +3294,7 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_string_dtype() and is_string_array(values, skipna=True): result = result.astype("string[pyarrow_numpy]") return result @@ -3363,7 +3363,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_string_dtype() and is_string_array(values, skipna=True): df = df.astype("string[pyarrow_numpy]") dfs.append(df) @@ -4735,9 +4735,9 @@ def read( else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): + if not (using_string_dtype() and 
values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_pyarrow_string_dtype() and is_string_array( + if using_string_dtype() and is_string_array( values, # type: ignore[arg-type] skipna=True, ): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 41b368c9b05c2..4fd7de7a28855 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -31,7 +31,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -2197,7 +2197,7 @@ def read_table( from pandas.io._util import _arrow_dtype_mapping mapping = _arrow_dtype_mapping().get - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): from pandas.io._util import arrow_string_types_mapper arrow_string_types_mapper() diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4ffd76722286a..884e6e002800e 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -303,7 +303,7 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") + @pytest.mark.xfail(using_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 1069a9e5aaa90..6752a503016f8 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import ( is_float_dtype, @@ -442,7 +442,7 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") + @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings") def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index ef0315130215c..e2e5d47f50209 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -78,7 +78,7 @@ def test_print_none_width(self): assert exp == repr(a) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Change once infer_string is set to True by default", ) def test_unicode_print(self): diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index f6a4396ca5be0..bbd9b150b88a8 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import 
using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import PYPY @@ -82,7 +82,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( - PYPY or using_pyarrow_string_dtype(), + PYPY or using_string_dtype(), reason="not relevant for PyPy doesn't work properly for arrow strings", ) def test_memory_usage(index_or_series_memory_obj): diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 3a8ed471f9dc0..42730519b32fd 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd import pandas._testing as tm @@ -100,7 +100,7 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") +@pytest.mark.xfail(using_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 5cd66d8a874c7..fad2560265d21 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import is_string_dtype @@ -37,7 +37,7 @@ def _get_expected_exception( else: result = self.frame_scalar_exc - if using_pyarrow_string_dtype() and result is not None: + if using_string_dtype() and result is not None: import pyarrow as pa result = ( # type: ignore[assignment] diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 09662f7b793a9..8f8af607585df 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -19,7 +19,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import Categorical @@ -99,7 +99,7 @@ def test_contains(self, data, data_missing): continue assert na_value_obj not in data # this section suffers from super method - if not using_pyarrow_string_dtype(): + if not using_string_dtype(): assert na_value_obj in data_missing def test_empty(self, dtype): diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 60a8e688b3b8a..4237e796e052e 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( DataFrame, @@ -44,9 +44,7 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="columns inferring logic broken" - ) + @pytest.mark.skipif(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index ed2f0aa9c4679..ed3f9ac611405 100644 --- 
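Most of the test modules below import this flag only to mark expected failures. To see the behaviour the flag gates, the option can be switched on globally; a hedged example (the exact dtype repr is an assumption based on the `StringDtype(storage="pyarrow_numpy")` spelling in the hunks above):

```python
import pandas as pd

# Enabling the option makes using_string_dtype() return True process-wide.
pd.set_option("future.infer_string", True)

ser = pd.Series(["a", "b"])
# Plain string data now infers to the Arrow-backed string dtype rather
# than object; expected dtype repr: string[pyarrow_numpy].
print(ser.dtype)
```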
a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import is_platform_little_endian @@ -58,7 +58,7 @@ def test_from_records_with_datetimes(self): tm.assert_frame_equal(result, expected) @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work" + using_string_dtype(), reason="dtype checking logic doesn't work" ) def test_from_records_sequencelike(self): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 1b852343266aa..5d18b5ed1f7cd 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -65,7 +65,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(TypeError, match=msg): datetime_frame.fillna() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -537,7 +537,7 @@ def test_fillna_col_reordering(self): filled = df.ffill() assert df.columns.tolist() == filled.columns.tolist() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index cdb9ff8a67b6b..7b206cc67d40d 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -65,7 +65,7 @@ def test_interpolate_inplace(self, frame_or_series, request): assert orig.squeeze()[1] == 1.5 @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic(self): df = DataFrame( @@ -90,7 +90,7 @@ def test_interp_basic(self): assert np.shares_memory(df["D"]._values, dvalues) @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3fcc4aaa6960f..0a980e5d358a5 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -30,9 +30,7 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - 
@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -293,9 +291,7 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame( @@ -304,9 +300,7 @@ def test_regex_replace_dict_nested_gh4115(self): result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -332,9 +326,7 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -350,9 +342,7 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -545,9 +535,7 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -557,9 +545,7 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan @@ -902,9 +888,7 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -933,9 +917,7 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail( - 
using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_dict_no_regex(self): answer = Series( { @@ -957,9 +939,7 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_series_no_regex(self): answer = Series( { @@ -1064,9 +1044,7 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) @@ -1197,9 +1175,7 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize( "replacer", [ @@ -1492,9 +1468,7 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 48f51dfa981ca..e8ef0592ac432 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._config.config import option_context import pandas as pd @@ -113,7 +113,7 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") + @pytest.mark.xfail(using_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 91b5f905ada22..d42d1d0316892 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -251,9 +251,7 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't compare string and int" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't compare string and int") def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError diff --git a/pandas/tests/frame/test_constructors.py 
b/pandas/tests/frame/test_constructors.py index dfcd0d7bfea54..6416ea6415eb3 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,7 +21,7 @@ from numpy.ma import mrecords import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat.numpy import np_version_gt2 @@ -299,14 +299,14 @@ def test_constructor_dtype_nocast_view_2d_array(self): df2 = DataFrame(df.values, dtype=df[0].dtype) assert df2._mgr.blocks[0].values.flags.c_contiguous - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) assert np.shares_memory(df.values, arr) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 649c30bdec790..3f4a5f2c97b6c 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import ( IS64, @@ -465,7 +465,7 @@ def test_mixed_ops(self, op): getattr(df, op)() @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + using_string_dtype(), reason="sum doesn't work for arrow strings" ) def test_reduce_mixed_frame(self): # GH 6806 @@ -1930,9 +1930,7 @@ def test_sum_timedelta64_skipna_false(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" -) +@pytest.mark.xfail(using_string_dtype(), reason="sum doesn't work with arrow strings") def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index f799495d8025a..10cc86385af1b 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( NA, @@ -176,7 +176,7 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") + @pytest.mark.xfail(using_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 4580e00069dc1..260b4203a4f04 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import Index @@ -16,7 +16,7 @@ def 
test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -81,7 +81,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 87ec8289089dc..d9c9fdc62b0bc 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -199,7 +199,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") + @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 491db3a63cc0d..b1361b3e8106e 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -4,14 +4,14 @@ import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex class TestCategoricalIndexRepr: - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 3b8e18463160f..d20611a61b154 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( DataFrame, @@ -42,7 +42,7 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request): result = repr(obj) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 4b8751fb3ba20..2f22c2490755e 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.tslibs import Timestamp @@ -438,7 +438,7 @@ def test_insert_base(self, index): assert 
index[0:4].equals(result) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="completely different behavior, tested elsewher", ) def test_insert_out_of_bounds(self, index): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 84cd0d3b08b7b..f889fb0686f1d 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import ( IS64, @@ -826,7 +826,7 @@ def replacer(self, how, from_key, to_key): return replacer # Expected needs adjustment for the infer string option, seems to work as expecetd - @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") + @pytest.mark.skipif(using_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 58255edb8e6df..67b3445e413f0 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import IndexingError @@ -426,9 +426,7 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't multiply arrow strings" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't multiply arrow strings") def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -654,7 +652,7 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") + @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. 
covers both uniform data-type & multi-type cases diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index bd1c378642924..07cb76adcaa10 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -13,7 +13,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas.errors import IndexingError @@ -1204,7 +1204,7 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") + @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_loc_setitem_str_to_small_float_conversion_type(self): # GH#20388 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 5591f8ec710e2..0c62b7df8e2cc 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -17,7 +17,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -691,9 +691,7 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="infer_string takes precedence" - ) + @pytest.mark.xfail(using_string_dtype(), reason="infer_string takes precedence") def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b12cfc6876a8e..af7b04d66096a 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -1347,9 +1347,7 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") sf._get_footer() # should not raise exception - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="Fixup when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") def test_east_asian_unicode_series(self): # not aligned properly because of east asian width @@ -1724,9 +1722,7 @@ def chck_ncols(self, s): ncolsizes = len({len(line.strip()) for line in lines}) assert ncolsizes == 1 - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="change when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") def test_format_explicit(self): test_sers = gen_series_formatting() with option_context("display.max_rows", 4, "display.show_dimensions", False): diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index ed871577d677f..5731f74a03852 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( CategoricalIndex, @@ -849,7 +849,7 @@ def test_to_string(self): frame.to_string() # TODO: split or simplify this test? 
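One caveat worth noting about the `xfail`/`skipif` pattern that recurs through these hunks: the condition is a plain function call inside a decorator, so it is evaluated once, when pytest imports the module, not per test run. A small illustration with a placeholder test body:

```python
import pytest
from pandas._config import using_string_dtype

# using_string_dtype() runs at collection time; toggling future.infer_string
# afterwards does not re-evaluate the marker.
@pytest.mark.xfail(using_string_dtype(), reason="placeholder: differs under string dtype")
def test_example():
    assert True
```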
- @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="fix when arrow is default") + @pytest.mark.xfail(using_string_dtype(), reason="fix when arrow is default") def test_to_string_index_with_nan(self): # GH#2850 df = DataFrame( diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a34c0adc69821..3c551e80ef00b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import IS64 import pandas.util._test_decorators as td @@ -1573,7 +1573,7 @@ def test_from_json_to_json_table_dtypes(self): # TODO: We are casting to string which coerces None to NaN before casting back # to object, ending up with incorrect na values - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion") + @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 @@ -1854,7 +1854,7 @@ def test_to_json_indent(self, indent): assert result == expected @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Adjust expected when infer_string is default, no bug here, " "just a complicated parametrization", ) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 2872b1e29d629..476ec2fc76488 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat.numpy import np_version_gte1p25 @@ -2656,7 +2656,7 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2672,7 +2672,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2686,7 +2686,7 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 7718899ff234b..053c290999f2f 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import is_integer @@ -231,7 +231,7 @@ def 
test_where_ndframe_align(): tm.assert_series_equal(out, expected) -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") +@pytest.mark.xfail(using_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 831c2338045ff..8901330108cb1 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( NA, @@ -22,9 +22,7 @@ import pandas._testing as tm -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" -) +@pytest.mark.xfail(using_string_dtype(), reason="share memory doesn't work for arrow") def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 90654df155cf0..0df5b5a5d0108 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd import pandas._testing as tm @@ -359,7 +359,7 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5], dtype=object) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ @@ -620,7 +620,7 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index c001e0f9b028a..78be4843f7a4d 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -144,7 +144,7 @@ def test_tidy_repr_name_0(self, arg): assert "Name: 0" in rep_str @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" + using_string_dtype(), reason="TODO: investigate why this is failing" ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) From 5af55e0df8bd2ff346eb91dc461cd8cc2d1abd77 Mon Sep 17 00:00:00 2001 From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:44:57 +0100 Subject: [PATCH 234/272] BUG: Missing value code not recognised for Stata format version 105 a… (#59325) * BUG: Missing value
code not recognised for Stata format version 105 and earlier * Move definition of the old missing value constant for the double type out of the loop --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 9 +++++ pandas/tests/io/data/stata/stata1_102.dta | Bin 0 -> 362 bytes pandas/tests/io/data/stata/stata1_103.dta | Bin 0 -> 364 bytes pandas/tests/io/data/stata/stata1_104.dta | Bin 0 -> 363 bytes pandas/tests/io/data/stata/stata1_105.dta | Bin 0 -> 409 bytes pandas/tests/io/data/stata/stata8_102.dta | Bin 0 -> 362 bytes pandas/tests/io/data/stata/stata8_103.dta | Bin 0 -> 364 bytes pandas/tests/io/data/stata/stata8_104.dta | Bin 0 -> 363 bytes pandas/tests/io/data/stata/stata8_105.dta | Bin 0 -> 409 bytes pandas/tests/io/test_stata.py | 39 ++++++++++++++++------ 11 files changed, 38 insertions(+), 11 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata1_102.dta create mode 100644 pandas/tests/io/data/stata/stata1_103.dta create mode 100644 pandas/tests/io/data/stata/stata1_104.dta create mode 100644 pandas/tests/io/data/stata/stata1_105.dta create mode 100644 pandas/tests/io/data/stata/stata8_102.dta create mode 100644 pandas/tests/io/data/stata/stata8_103.dta create mode 100644 pandas/tests/io/data/stata/stata8_104.dta create mode 100644 pandas/tests/io/data/stata/stata8_105.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e71220102cbb4..768b12ba1007f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -584,6 +584,7 @@ I/O - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
(:issue:`58638`) - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) +- Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 03c15d0ab07bb..4be06f93689f2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1817,10 +1817,19 @@ def read( return data def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: + # missing code for double was different in version 105 and prior + old_missingdouble = float.fromhex("0x1.0p333") + # Check for missing values, and replace if found replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] + # recode instances of the old missing code to the currently used value + if self._format_version <= 105 and fmt == "d": + data.iloc[:, i] = data.iloc[:, i].replace( + old_missingdouble, self.MISSING_VALUES["d"] + ) + if self._format_version <= 111: if fmt not in self.OLD_VALID_RANGE: continue diff --git a/pandas/tests/io/data/stata/stata1_102.dta b/pandas/tests/io/data/stata/stata1_102.dta new file mode 100644 index 0000000000000000000000000000000000000000..d0ca1b2a8c02d7053e9dea85f60c070758ceba7a GIT binary patch literal 362 zcmYdeU}RtgVnQG-B{MT8Ej~B1xEQE31;$8%F*0F{92f)HL{&>YgLDQ4RYL<1t!e?` bK`BEcFc(?jy&kj+%J)H6tDU{EzQ0MV)z c5FV5=Gy-#x)ei%9J;Vjy&kj+%J)H6tDU{EzQ0MV)z d5FV5=Gy-#x)ei%9J;VLk}`AB;&U^Li-FoxV2mUfBNN8R wfiaM6QMJ@FNQVlj8XCYksupl20}9*F2qA%N(C}c_L%eYyr2c>X|NsB%0kr-fy8r+H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_102.dta b/pandas/tests/io/data/stata/stata8_102.dta new file mode 100644 index 0000000000000000000000000000000000000000..5d3a4fb171e9cd58d763080649c593989b4ba18b GIT binary patch literal 362 zcmYdeU}RtgVnQG@Gbb%2Gq1!V9;6b;Ff;?PfDB_J5F;%oKM~9XGt5jtVroo40t{3w e^$gM(81PXrrNe^$Uk`-;|JMUO1HwRXAOrxWY8(>) literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_103.dta b/pandas/tests/io/data/stata/stata8_103.dta new file mode 100644 index 0000000000000000000000000000000000000000..623a21e37650f5a308047b14bb1df86d7abe88a1 GIT binary patch literal 364 zcmYdiVq{=tU}PW+GBb11QZn;OEaE|Gfeb@45DUmKHUcrya`F?wOfbXD1SF=$1SG&f f)l$zOoq+)#1yedK=>PRV`2T-B&@&(m1P4L@tB4#H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_104.dta b/pandas/tests/io/data/stata/stata8_104.dta new file mode 100644 index 0000000000000000000000000000000000000000..df79d6a8af23018aafb2a2bf2b4fac488bad6d67 GIT binary patch literal 363 zcmc~`Vq{=tU}PW+k}`ABQZn;OEaE|Gfeb@45DUmKHUcrya`F?wOfbXD1SF=$1SG&f g)l$zOoq+)#1yedK==%Tl|NsB52l@qsf#5(00DLtY5dZ)H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_105.dta b/pandas/tests/io/data/stata/stata8_105.dta new file mode 100644 index 0000000000000000000000000000000000000000..cf01463a83d8146fc7736a0ec4db0581bfd393f0 GIT binary patch literal 409 zcmc~~Vq{=tU}PW+49yiBOVbsM3=BnSjL9n1BQrs9Ne7q?5^pY8y6G{r~#^|Nqwm{R6^4a3BN#rLrEr literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index fb7182fdefb32..c2c4140fa304d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -120,9 +120,9 @@ def test_read_index_col_none(self, version, temp_file): expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(read_df, expected, check_index_type=True) - # Note this test starts at format version 108 as the missing code for double - # 
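For readers decoding the `stata.py` hunk above: formats 105 and older used a different bit pattern as the missing-value sentinel for `double` columns. The constant is plain Python and can be inspected directly (the magnitude comment is arithmetic, not new information from the patch):

```python
# The pre-106 Stata sentinel for a missing double, as defined in the patch:
old_missingdouble = float.fromhex("0x1.0p333")  # == 2**333, about 1.75e100
# _do_convert_missing() first rewrites this value to the modern sentinel,
# self.MISSING_VALUES["d"], then applies the usual range-based missing logic.
print(old_missingdouble)
```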
was different prior to this (see GH 58149) and would therefore fail - @pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119]) + @pytest.mark.parametrize( + "version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119] + ) def test_read_dta1(self, version, datapath): file = datapath("io", "data", "stata", f"stata1_{version}.dta") parsed = self.read_dta(file) @@ -918,8 +918,8 @@ def test_missing_value_generator(self, temp_file): ) assert val.string == ".z" - @pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"]) - def test_missing_value_conversion(self, file, datapath): + @pytest.mark.parametrize("version", [113, 115, 117]) + def test_missing_value_conversion(self, version, datapath): columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] smv = StataMissingValue(101) keys = sorted(smv.MISSING_VALUES.keys()) @@ -930,14 +930,13 @@ def test_missing_value_conversion(self, file, datapath): expected = DataFrame(data, columns=columns) parsed = read_stata( - datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, ) tm.assert_frame_equal(parsed, expected) - # Note this test starts at format version 108 as the missing code for double - # was different prior to this (see GH 58149) and would therefore fail - @pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"]) - def test_missing_value_conversion_compat(self, file, datapath): + @pytest.mark.parametrize("version", [104, 105, 108, 110, 111]) + def test_missing_value_conversion_compat(self, version, datapath): columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] smv = StataMissingValue(101) keys = sorted(smv.MISSING_VALUES.keys()) @@ -947,7 +946,25 @@ def test_missing_value_conversion_compat(self, file, datapath): expected = DataFrame(data, columns=columns) parsed = read_stata( - datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, + ) + tm.assert_frame_equal(parsed, expected) + + # The byte type was not supported prior to the 104 format + @pytest.mark.parametrize("version", [102, 103]) + def test_missing_value_conversion_compat_nobyte(self, version, datapath): + columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] + smv = StataMissingValue(101) + keys = sorted(smv.MISSING_VALUES.keys()) + data = [] + row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]] + data.append(row) + expected = DataFrame(data, columns=columns) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, ) tm.assert_frame_equal(parsed, expected) From 9b375be5aa3610e8a21ef0b5b81e4db04270f3d3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Jul 2024 17:14:00 +0200 Subject: [PATCH 235/272] TST (string dtype): clean-up xpassing tests with future string dtype (#59323) --- pandas/tests/arithmetic/test_object.py | 3 --- pandas/tests/base/test_unique.py | 5 +---- pandas/tests/frame/constructors/test_from_dict.py | 3 ++- pandas/tests/frame/constructors/test_from_records.py | 4 +--- pandas/tests/frame/methods/test_fillna.py | 2 ++ pandas/tests/frame/methods/test_info.py | 11 ++++++----- pandas/tests/frame/methods/test_interpolate.py | 1 + pandas/tests/frame/test_arithmetic.py | 3 --- pandas/tests/indexes/interval/test_formats.py | 7 ++----- pandas/tests/indexing/test_coercion.py | 4 ----
pandas/tests/indexing/test_indexing.py | 4 ---- pandas/tests/series/methods/test_reindex.py | 3 --- pandas/tests/series/methods/test_replace.py | 1 - pandas/tests/series/test_formats.py | 2 +- 14 files changed, 16 insertions(+), 37 deletions(-) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 884e6e002800e..4b5156d0007bb 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -303,7 +301,6 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 42730519b32fd..7f094db6ea524 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -100,12 +98,11 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji - obj = index_or_series([uval] * 2) + obj = index_or_series([uval] * 2, dtype=object) result = obj.unique() if isinstance(obj, pd.Index): diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 4237e796e052e..fc7c03dc25839 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -44,7 +44,7 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(using_string_dtype(), reason="columns inferring logic broken") + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), @@ -108,6 +108,7 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient="index") tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index ed3f9ac611405..abc3aab1c1492 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -57,9 +57,7 @@ def test_from_records_with_datetimes(self): expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]") tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_string_dtype(), reason="dtype checking logic doesn't work" - ) + @pytest.mark.xfail(using_string_dtype(), reason="dtype checking logic doesn't work") def test_from_records_sequencelike(self): df = DataFrame( { diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 5d18b5ed1f7cd..b72cac6f3f9a1 100644 --- 
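A pattern to watch in the hunks below: wherever a test previously relied on a bare string list inferring to ``object``, the dtype is now pinned explicitly so the assertion keeps its meaning once inference changes. A hedged sketch built from the `test_info.py` changes (the deep-vs-shallow contrast holds on CPython; on PyPy, as the surrounding tests note, the two are equal):

```python
import pandas as pd

# Pinning object dtype keeps the deep-vs-shallow memory_usage contrast meaningful:
df = pd.DataFrame({"a": [1]}, index=pd.Index(["foo"], dtype=object))
shallow = df.memory_usage(index=True).sum()
deep = df.memory_usage(index=True, deep=True).sum()  # also counts the str payload
assert deep >= shallow
```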
a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -65,6 +65,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(TypeError, match=msg): datetime_frame.fillna() + # TODO(infer_string) test as actual error instead of xfail @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame @@ -537,6 +538,7 @@ def test_fillna_col_reordering(self): filled = df.ffill() assert df.columns.tolist() == filled.columns.tolist() + # TODO(infer_string) test as actual error instead of xfail @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 4e3726f4dc51d..17cb989626e70 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -15,6 +15,7 @@ from pandas import ( CategoricalIndex, DataFrame, + Index, MultiIndex, Series, date_range, @@ -360,7 +361,7 @@ def test_info_memory_usage(): df = DataFrame(data) df.columns = dtypes - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) @@ -398,25 +399,25 @@ def test_info_memory_usage(): @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") def test_info_memory_usage_deep_not_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() > df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() == df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 7b206cc67d40d..b8a34d5eaa226 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -64,6 +64,7 @@ def test_interpolate_inplace(self, frame_or_series, request): assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 + # TODO(infer_string) raise proper TypeError in case of string dtype @pytest.mark.xfail( using_string_dtype(), reason="interpolate doesn't work for string" ) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index d42d1d0316892..3971e58e8235e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from 
pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -251,7 +249,6 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) - @pytest.mark.xfail(using_string_dtype(), reason="can't compare string and int") def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index d20611a61b154..f858ae137ca4e 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, DatetimeIndex, @@ -42,12 +40,11 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request): result = repr(obj) assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 markers = Series( - ["foo", "bar"], + [1, 2], index=IntervalIndex( [ Interval(left, right) @@ -59,7 +56,7 @@ def test_repr_floats(self): ), ) result = str(markers) - expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object" + expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index f889fb0686f1d..d5002a47c3447 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -825,8 +823,6 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - # Expected needs adjustment for the infer string option, seems to work as expecetd - @pytest.mark.skipif(using_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 67b3445e413f0..e8d16f8240db6 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -426,7 +424,6 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail(using_string_dtype(), reason="can't multiply arrow strings") def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -652,7 +649,6 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. 
covers both uniform data-type & multi-type cases diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 8901330108cb1..068446a5e216b 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( NA, Categorical, @@ -22,7 +20,6 @@ import pandas._testing as tm -@pytest.mark.xfail(using_string_dtype(), reason="share memory doesn't work for arrow") def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0df5b5a5d0108..97151784eb94c 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -359,7 +359,6 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5], dtype=object) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 78be4843f7a4d..1d95fbf8dccb8 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -144,7 +144,7 @@ def test_tidy_repr_name_0(self, arg): assert "Name: 0" in rep_str @pytest.mark.xfail( - using_string_dtype(), reason="TODO: investigate why this is failing" + using_string_dtype(), reason="TODO(infer_string): investigate failure" ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) From 7acec4fc4dccc9b9133cac53ab3a9a427e9208b0 Mon Sep 17 00:00:00 2001 From: Tirthraj Parmar Date: Mon, 29 Jul 2024 13:59:59 -0400 Subject: [PATCH 236/272] TST: Add test for `numpy.maximum` with `Timestamp` and `Series` of `datetime64` (#59338) Add test for `numpy.maximum` with `Timestamp` and `datetime64` `Series` --- pandas/tests/arithmetic/test_datetime64.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index cfc93ecae295d..26dfcf088e74b 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -389,6 +389,22 @@ def test_dt64_compare_datetime_scalar(self, datetimelike, op, expected): expected = Series(expected, name="A") tm.assert_series_equal(result, expected) + def test_ts_series_numpy_maximum(self): + # GH#50864, test numpy.maximum does not fail + # given a TimeStamp and Series(with dtype datetime64) comparison + ts = Timestamp("2024-07-01") + ts_series = Series( + ["2024-06-01", "2024-07-01", "2024-08-01"], + dtype="datetime64[us]", + ) + + expected = Series( + ["2024-07-01", "2024-07-01", "2024-08-01"], + dtype="datetime64[us]", + ) + + tm.assert_series_equal(expected, np.maximum(ts, ts_series)) + class TestDatetimeIndexComparisons: # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate From 00cd5f16fef86946c63230e9839aa5f39ed4f8dc Mon Sep 17 00:00:00 2001 From: Sukriti <42755203+sukriti1@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:00:56 -0400 Subject: [PATCH 237/272] Doc: Fix docstring SA01 error for pandas.core.groupby median and pandas.core.resample median (#59339) * Fix docstring SA01 error for pandas.core.groupby median * fix SA01 docstring code check for 
pandas core resample median * fix E501 docstring for pandas core groupby median --- ci/code_checks.sh | 3 --- pandas/core/groupby/groupby.py | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f0d04dd33640d..13685052078dc 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -328,7 +328,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.max SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.median SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.min SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ @@ -347,7 +346,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.max SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.median SA01" \ -i "pandas.core.groupby.SeriesGroupBy.min SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ @@ -362,7 +360,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.indices SA01" \ -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.mean SA01" \ - -i "pandas.core.resample.Resampler.median SA01" \ -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.ohlc SA01" \ -i "pandas.core.resample.Resampler.prod SA01" \ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 945173bc48fe9..c07bc56377151 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2347,6 +2347,12 @@ def median(self, numeric_only: bool = False) -> NDFrameT: Series or DataFrame Median of values within each group. + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. 
+ Examples -------- For SeriesGroupBy: From dd28adb0a37815679a813ad9402ded54b3b0fa88 Mon Sep 17 00:00:00 2001 From: Dipanshi Bansal Date: Mon, 29 Jul 2024 18:02:24 +0000 Subject: [PATCH 238/272] BUG: Added test for setitem using loc not aligning on index (#59340) * initial commit * Added test --- pandas/tests/indexing/test_loc.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 07cb76adcaa10..72cda194bec53 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3264,3 +3264,11 @@ def test_loc_nonunique_masked_index(self): index=Index(np.array(ids).repeat(1000), dtype="Int64"), ) tm.assert_frame_equal(result, expected) + + def test_loc_index_alignment_for_series(self): + # GH #56024 + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + other = Series([200, 999], index=[1, 0]) + df.loc[:, "a"] = other + expected = DataFrame({"a": [999, 200], "b": [3, 4]}) + tm.assert_frame_equal(expected, df) From c8bdce929eab2b91ab554508e6a8f4b27f7eccae Mon Sep 17 00:00:00 2001 From: CaesarTY <32744105+CaesarTY@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:03:18 -0400 Subject: [PATCH 239/272] DOC: add SA01 for pandas.Timestamp.isoweekday (#59341) add SA01 for pandas.Timestamp.isoweekday --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/nattype.pyx | 7 +++++++ pandas/_libs/tslibs/timestamps.pyx | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 13685052078dc..2c713a8e7bbea 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -223,7 +223,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.fromordinal SA01" \ -i "pandas.Timestamp.fromtimestamp PR01,SA01" \ -i "pandas.Timestamp.hour GL08" \ - -i "pandas.Timestamp.isoweekday SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.microsecond GL08" \ -i "pandas.Timestamp.min PR02" \ diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 4544cf56a11ec..130e41e5104a2 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -441,6 +441,13 @@ class NaTType(_NaT): Monday == 1 ... Sunday == 7. + See Also + -------- + Timestamp.weekday : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isocalendar : Return a tuple containing ISO year, week number + and weekday. + datetime.date.isoweekday : Equivalent method in datetime module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index cd749effd1a5f..369184d9df40c 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -2775,6 +2775,13 @@ default 'raise' Monday == 1 ... Sunday == 7. + See Also + -------- + Timestamp.weekday : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isocalendar : Return a tuple containing ISO year, week number + and weekday. + datetime.date.isoweekday : Equivalent method in datetime module. 
+ Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') From f4c454a069ff3767eee999d034ec3ee577e17565 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:26:54 -1000 Subject: [PATCH 240/272] TST: Remove unnecessary test_boolean_types.xlsx (#59348) --- .../tests/io/data/excel/test_boolean_types.xlsx | Bin 5279 -> 0 bytes pandas/tests/io/excel/test_readers.py | 10 +++++----- 2 files changed, 5 insertions(+), 5 deletions(-) delete mode 100644 pandas/tests/io/data/excel/test_boolean_types.xlsx diff --git a/pandas/tests/io/data/excel/test_boolean_types.xlsx b/pandas/tests/io/data/excel/test_boolean_types.xlsx deleted file mode 100644 index 234703c32f0abe61516c3e44aa35275242d14f08..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5279 zcmZ`-2T+sS)(s#fRD<*`O-krZ1nDL8geFBqN`z1X1Q4Y6Dn$g94gvzwR1lF~l&XM$ z^xh%#s?d8U{%ze zwCR=#p2{uXIdD1 zV=GY)eLXB#wP0Wzc2F_jgJuCTM6FryIT+_bg>Ds5x|s3T{#5Re2nmvsA{i8)6(b56 z*RbXF#}W4}s}<>>Yj5TG7Hwz5q18J_!Kyw;9d35;iP!)x_NJ|N<{9>=9G)iA2syd& zFkGg6@Osnv%-C(a8Xb22bR`qX#&XrqFXU*8u?k#1kM0p9|L_mmFZUO3D3`=OAx0d4 zn%DYTD($~

[GIT binary patch data for the deleted test_boolean_types.xlsx omitted]
z?r|h8nN=^>PrFn;^69Jg^vL`F>3rtbd(|GVs=8n_rq002VFHvki7Ao<0w F{{s~i>9PO- diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 0c62b7df8e2cc..bc041882b9fab 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -165,13 +165,13 @@ def xfail_datetimes_with_pyxlsb(engine, request): class TestReaders: @pytest.mark.parametrize("col", [[True, None, False], [True], [True, False]]) - def test_read_excel_type_check(self, col, datapath): + def test_read_excel_type_check(self, col, tmp_excel, read_ext): # GH 58159 + if read_ext in (".xlsb", ".xls"): + pytest.skip(f"No engine for filetype: '{read_ext}'") df = DataFrame({"bool_column": col}, dtype="boolean") - f_path = datapath("io", "data", "excel", "test_boolean_types.xlsx") - - df.to_excel(f_path, index=False) - df2 = pd.read_excel(f_path, dtype={"bool_column": "boolean"}, engine="openpyxl") + df.to_excel(tmp_excel, index=False) + df2 = pd.read_excel(tmp_excel, dtype={"bool_column": "boolean"}) tm.assert_frame_equal(df, df2) def test_pass_none_type(self, datapath): From 56ea76ac5e7b700102dfbf7bcc3a24635692fb29 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 29 Jul 2024 15:27:36 -0400 Subject: [PATCH 241/272] DOC: Promote Arrow C Data Interface over Interchange Protocol (#59347) --- pandas/core/frame.py | 5 +++++ pandas/core/interchange/from_dataframe.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b897e868ce134..ea91046f4b8e4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -901,6 +901,11 @@ def __dataframe__( """ Return the dataframe interchange object implementing the interchange protocol. + .. note:: + + For new development, we highly recommend using the Arrow C Data Interface + alongside the Arrow PyCapsule Interface instead of the interchange protocol + .. warning:: Due to severe implementation issues, we recommend only considering using the diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 869ff43728860..7f2647d64b190 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -36,6 +36,11 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: """ Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + .. note:: + + For new development, we highly recommend using the Arrow C Data Interface + alongside the Arrow PyCapsule Interface instead of the interchange protocol + .. 
warning:: Due to severe implementation issues, we recommend only considering using the From f25a09eacee843f643604a7406a5bb6bcc4361e5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 21:40:31 +0200 Subject: [PATCH 242/272] String dtype: rename the storage options and add `na_value` keyword in `StringDtype()` (#59330) * rename storage option and add na_value keyword * update init * fix propagating na_value to Array class + fix some tests * fix more tests * disallow pyarrow_numpy as option + fix more cases of checking storage to be pyarrow_numpy * restore pyarrow_numpy as option for now * linting * try fix typing * try fix typing * fix dtype equality to take into account the NaN vs NA * fix pickling of dtype * fix test_convert_dtypes * update expected result for dtype='string' * suppress typing error with _metadata attribute --- pandas/_libs/lib.pyx | 2 +- pandas/_testing/__init__.py | 4 +- pandas/core/arrays/arrow/array.py | 6 +- pandas/core/arrays/string_.py | 89 ++++++++++++++----- pandas/core/arrays/string_arrow.py | 11 ++- pandas/core/construction.py | 4 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexes/base.py | 3 +- pandas/core/internals/construction.py | 2 +- pandas/core/reshape/encoding.py | 3 +- pandas/core/reshape/merge.py | 3 +- pandas/core/tools/numeric.py | 9 +- pandas/io/_util.py | 6 +- pandas/tests/arrays/string_/test_string.py | 76 +++++++++------- .../tests/arrays/string_/test_string_arrow.py | 4 +- pandas/tests/extension/base/methods.py | 8 +- pandas/tests/extension/test_string.py | 44 +++++---- .../frame/methods/test_convert_dtypes.py | 1 + pandas/tests/series/test_constructors.py | 7 +- pandas/tests/strings/__init__.py | 2 +- 20 files changed, 176 insertions(+), 110 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2650d60eb3cef..0bb47541e5963 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2702,7 +2702,7 @@ def maybe_convert_objects(ndarray[object] objects, if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 1cd91ee5b120c..3aa53d4b07aa5 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -509,14 +509,14 @@ def shares_memory(left, right) -> bool: if ( isinstance(left, ExtensionArray) and is_string_dtype(left.dtype) - and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and left.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left = cast("ArrowExtensionArray", left) if ( isinstance(right, ExtensionArray) and is_string_dtype(right.dtype) - and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and right.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): right = cast("ArrowExtensionArray", right) left_pa_data = left._pa_array diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5da479760047f..a17056b51a014 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -575,10 +575,8 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not 
len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ): + if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 13c26f0c97934..cae770d85637c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -9,7 +9,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_string_dtype, +) from pandas._libs import ( lib, @@ -81,8 +84,10 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow"}, optional If not given, the value of ``pd.options.mode.string_storage``. + na_value : {np.nan, pd.NA}, default pd.NA + Whether the dtype follows NaN or NA missing value semantics. Attributes ---------- @@ -113,30 +118,67 @@ class StringDtype(StorageExtensionDtype): # follows NumPy semantics, which uses nan. @property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": - return np.nan - else: - return libmissing.NA + return self._na_value - _metadata = ("storage",) + _metadata = ("storage", "_na_value") # type: ignore[assignment] - def __init__(self, storage=None) -> None: + def __init__( + self, + storage: str | None = None, + na_value: libmissing.NAType | float = libmissing.NA, + ) -> None: + # infer defaults if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + if using_string_dtype(): + storage = "pyarrow" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + + if storage == "pyarrow_numpy": + # TODO raise a deprecation warning + storage = "pyarrow" + na_value = np.nan + + # validate options + if storage not in {"python", "pyarrow"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." 
) + + if isinstance(na_value, float) and np.isnan(na_value): + # when passed a NaN value, always set to np.nan to ensure we use + # a consistent NaN value (and we can use `dtype.na_value is np.nan`) + na_value = np.nan + elif na_value is not libmissing.NA: + raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + self.storage = storage + self._na_value = na_value + + def __eq__(self, other: object) -> bool: + # we need to override the base class __eq__ because na_value (NA or NaN) + # cannot be checked with normal `==` + if isinstance(other, str): + if other == self.name: + return True + try: + other = self.construct_from_string(other) + except TypeError: + return False + if isinstance(other, type(self)): + return self.storage == other.storage and self.na_value is other.na_value + return False + + def __hash__(self) -> int: + # need to override __hash__ as well because of overriding __eq__ + return super().__hash__() + + def __reduce__(self): + return StringDtype, (self.storage, self.na_value) @property def type(self) -> type[str]: @@ -181,6 +223,7 @@ def construct_from_string(cls, string) -> Self: elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": + # TODO deprecate return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -205,7 +248,7 @@ def construct_array_type( # type: ignore[override] if self.storage == "python": return StringArray - elif self.storage == "pyarrow": + elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray else: return ArrowStringArrayNumpySemantics @@ -217,13 +260,17 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. """ if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + if self._na_value is libmissing.NA: + from pandas.core.arrays.string_arrow import ArrowStringArray + + return ArrowStringArray(array) + else: + from pandas.core.arrays.string_arrow import ( + ArrowStringArrayNumpySemantics, + ) - return ArrowStringArray(array) - elif self.storage == "pyarrow_numpy": - from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + return ArrowStringArrayNumpySemantics(array) - return ArrowStringArrayNumpySemantics(array) else: import pyarrow diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97c06149d0b7e..869cc34d5f61d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -131,6 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _storage = "pyarrow" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values) -> None: _chk_pyarrow_available() @@ -140,7 +141,7 @@ def __init__(self, values) -> None: values = pc.cast(values, pa.large_string()) super().__init__(values) - self._dtype = StringDtype(storage=self._storage) + self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -187,10 +188,7 @@ def _from_sequence( if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ) + assert isinstance(dtype, StringDtype) and 
dtype.storage == "pyarrow" if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -597,7 +595,8 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow_numpy" + _storage = "pyarrow" + _na_value = np.nan @classmethod def _result_converter(cls, values, na=None): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 32792aa7f0543..81aeb40f375b0 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -574,7 +574,7 @@ def sanitize_array( if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype("pyarrow", na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -608,7 +608,7 @@ def sanitize_array( elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 21e45505b40fc..d750451a1ca84 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e67c59c86dd0c..50f44cc728aea 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5453,9 +5453,10 @@ def equals(self, other: Any) -> bool: if ( isinstance(self.dtype, StringDtype) - and self.dtype.storage == "pyarrow_numpy" + and self.dtype.na_value is np.nan and other.dtype != self.dtype ): + # TODO(infer_string) can we avoid this special case? 
# special case for object behavior return other.equals(self.astype(object)) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c31479b3011e5..08e1650a5de12 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -302,7 +302,7 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) obj_columns = list(values) block_values = [ diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 9d88e61951e99..c397c1c2566a5 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import missing as libmissing from pandas._libs.sparse import IntIndex from pandas.core.dtypes.common import ( @@ -256,7 +257,7 @@ def _get_dummies_1d( dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] elif ( isinstance(input_dtype, StringDtype) - and input_dtype.storage != "pyarrow_numpy" + and input_dtype.na_value is libmissing.NA ): dtype = pandas_dtype("boolean") # type: ignore[assignment] else: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2ce77ac19b9c5..6364072fd215c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2677,8 +2677,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) - and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 3d406d3bfb115..26e73794af298 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -7,7 +7,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -218,7 +221,7 @@ def to_numeric( coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy", + and values_dtype.na_value is libmissing.NA, ) if new_mask is not None: @@ -229,7 +232,7 @@ def to_numeric( dtype_backend is not lib.no_default and new_mask is None or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy" + and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index cb0f89945e440..a72a16269959d 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING +import numpy as np + from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -32,6 +34,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow_numpy"), - pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), }.get 
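A usage sketch of the constructor signature extended by this patch (assuming a
pandas build that includes this change, with pyarrow installed); the
``storage`` and ``na_value`` names come straight from the ``string_.py`` hunk
above, and the assertions mirror the ``test_dtype_equality`` test added below.

    import numpy as np
    import pandas as pd

    # The old "pyarrow_numpy" storage is now spelled as pyarrow storage with
    # NaN missing-value semantics; na_value defaults to pd.NA.
    na_dtype = pd.StringDtype(storage="pyarrow")
    nan_dtype = pd.StringDtype(storage="pyarrow", na_value=np.nan)

    # Equality accounts for both parameters, so the two pyarrow-backed
    # variants are distinct dtypes even though they share a backend.
    assert na_dtype != nan_dtype
    assert nan_dtype == pd.StringDtype("pyarrow", na_value=float("nan"))

    # Any NaN passed in is normalized to np.nan, so identity checks work.
    assert na_dtype.na_value is pd.NA
    assert nan_dtype.na_value is np.nan

Keeping ``storage`` about the backing array while ``na_value`` selects the
missing-value sentinel is why the patch overrides ``__eq__``, ``__hash__``
(Python drops the inherited hash once ``__eq__`` is redefined), and
``__reduce__`` so that pickling round-trips both parameters.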
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 597b407a29c94..7757847f3c841 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -20,13 +20,6 @@ ) -def na_val(dtype): - if dtype.storage == "pyarrow_numpy": - return np.nan - else: - return pd.NA - - @pytest.fixture def dtype(string_storage): """Fixture giving StringDtype from parametrized 'string_storage'""" @@ -39,24 +32,45 @@ def cls(dtype): return dtype.construct_array_type() +def test_dtype_equality(): + pytest.importorskip("pyarrow") + + dtype1 = pd.StringDtype("python") + dtype2 = pd.StringDtype("pyarrow") + dtype3 = pd.StringDtype("pyarrow", na_value=np.nan) + + assert dtype1 == pd.StringDtype("python", na_value=pd.NA) + assert dtype1 != dtype2 + assert dtype1 != dtype3 + + assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA) + assert dtype2 != dtype1 + assert dtype2 != dtype3 + + assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan) + assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan")) + assert dtype3 != dtype1 + assert dtype3 != dtype2 + + def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": + if dtype.storage == "pyarrow" and dtype.na_value is pd.NA: arr_name = "ArrowStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: @@ -68,7 +82,7 @@ def test_repr(dtype): def test_none_to_nan(cls, dtype): a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None - assert a[1] is na_val(a.dtype) + assert a[1] is a.dtype.na_value def test_setitem_validates(cls, dtype): @@ -225,7 +239,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -244,7 +258,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -271,7 +285,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -293,7 +307,7 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, 
True, False]) else: @@ -387,7 +401,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: err = ValueError msg = "cannot convert float NaN to integer" else: @@ -441,7 +455,7 @@ def test_min_max(method, skipna, dtype): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is na_val(arr.dtype) + assert result is arr.dtype.na_value @pytest.mark.parametrize("method", ["min", "max"]) @@ -490,7 +504,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) @@ -522,7 +536,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is na_val(result["a"].dtype) + assert result.loc[2, "a"] is result["a"].dtype.na_value @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") @@ -556,10 +570,10 @@ def test_arrow_load_from_zero_chunks( def test_value_counts_na(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -573,10 +587,10 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = np.float64 + elif dtype.storage == "pyarrow": + exp_dtype = "double[pyarrow]" else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -586,10 +600,10 @@ def test_value_counts_with_normalize(dtype): def test_value_counts_sort_false(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) @@ -621,7 +635,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", na_val(dtype), "b"], dtype=object) + expected = np.array(["a", dtype.na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -661,7 +675,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is na_val(ser.dtype) + assert ser.array[1] is ser.dtype.na_value # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 
405c1c217b04d..c610ef5315723 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -29,6 +29,8 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): if using_infer_string and string_storage != "pyarrow_numpy": request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) + if string_storage == "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) @@ -260,6 +262,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python' or 'pyarrow'." with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b7f0f973e640a..dd2ed0bd62a02 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data): expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) - if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( + if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan: + # TODO: avoid special-casing + expected = expected.astype("float64") + elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( data.dtype, pd.ArrowDtype ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") - elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": - # TODO: avoid special-casing - expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 49ad3fce92a5c..4628c5568b49b 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -96,9 +96,15 @@ def data_for_grouping(dtype, chunked): class TestStringArray(base.ExtensionTests): def test_eq_with_str(self, dtype): - assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) + if dtype.na_value is pd.NA: + # only the NA-variant supports parametrized string alias + assert dtype == f"string[{dtype.storage}]" + elif dtype.storage == "pyarrow": + # TODO(infer_string) deprecate this + assert dtype == "string[pyarrow_numpy]" + def test_is_not_string_type(self, dtype): # Different from BaseDtypeTests.test_is_not_string_type # because StringDtype is a string type @@ -140,28 +146,21 @@ def _get_expected_exception( self, op_name: str, obj, other ) -> type[Exception] | None: if op_name in ["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) and cast( - StringDtype, tm.get_dtype(obj) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if ( + isinstance(obj, pd.Series) + and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError - elif isinstance(other, pd.Series) and cast( - StringDtype, tm.get_dtype(other) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + elif ( + isinstance(other, pd.Series) + and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? 
return NotImplementedError return TypeError elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": return NotImplementedError return TypeError elif op_name in ["__mul__", "__rmul__"]: @@ -175,10 +174,7 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": import pyarrow as pa # TODO: better to re-raise as TypeError? @@ -190,7 +186,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) @@ -198,10 +194,10 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: cast_to = dtype + elif dtype.na_value is np.nan: + cast_to = np.bool_ # type: ignore[assignment] elif dtype.storage == "pyarrow": cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": - cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 521d2cb14ac6a..9cbbebf35b2d1 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -18,6 +18,7 @@ def test_convert_dtypes( # Just check that it works for DataFrame here if using_infer_string: string_storage = "pyarrow_numpy" + df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 44a7862c21273..91cf1708ed43b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2113,9 +2113,12 @@ def test_series_string_inference_array_string_dtype(self): tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): - # GH#54793 + # https://github.com/pandas-dev/pandas/issues/54793 + # but after PDEP-14 (string dtype), it was decided to keep dtype="string" + # returning the NA string dtype, so expected is changed from + # "string[pyarrow_numpy]" to "string[pyarrow]" pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + expected = Series(["a", "b"], dtype="string[pyarrow]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 01b49b5e5b633..e94f656fc9823 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -7,7 +7,7 @@ def _convert_na_value(ser, expected): if ser.dtype != object: - if ser.dtype.storage == "pyarrow_numpy": + if ser.dtype.na_value is np.nan: expected = expected.fillna(np.nan) else: # GH#18463 From aa4dc71f54764252ff795cc42b1c465e20a204c0 Mon Sep 17 00:00:00 2001 From: matiaslindgren Date: Mon, 29 Jul 2024 21:42:54 +0200 Subject: [PATCH 243/272] BUG: Fix 57735 (#59335) * Revert 
"CLN: Remove special cases in indexing ops (#52063)" This reverts commit 8e456d3599541dc1a7fe7ec742274774f768f97d. * remove old comments, add test * use better test name * Update pandas/tests/indexing/test_loc.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * check for empty index first * assert assign to empty does not change frame * format --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/indexing.py | 16 +++++++++++----- pandas/tests/indexing/test_loc.py | 7 +++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 455e61b8bc254..debb5bdd4fc4b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -810,8 +810,11 @@ def _maybe_mask_setitem_value(self, indexer, value): if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1: # e.g. test_loc_setitem_boolean_mask_allfalse - # test_loc_setitem_ndframe_values_alignment - value = self.obj.iloc._align_series(indexer, value) + if len(newkey) == 0: + value = value.iloc[:0] + else: + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_series(indexer, value) indexer = (newkey, icols) elif ( @@ -827,8 +830,11 @@ def _maybe_mask_setitem_value(self, indexer, value): indexer = (newkey, icols) elif ndim == 2 and value.shape[1] == 1: - # test_loc_setitem_ndframe_values_alignment - value = self.obj.iloc._align_frame(indexer, value) + if len(newkey) == 0: + value = value.iloc[:0] + else: + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_frame(indexer, value) indexer = (newkey, icols) elif com.is_bool_indexer(indexer): indexer = indexer.nonzero()[0] @@ -2389,7 +2395,7 @@ def ravel(i): new_ix = Index([new_ix]) else: new_ix = Index(new_ix) - if ser.index.equals(new_ix): + if not len(new_ix) or ser.index.equals(new_ix): if using_cow: return ser return ser._values.copy() diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 72cda194bec53..f90bd9e6802c8 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3272,3 +3272,10 @@ def test_loc_index_alignment_for_series(self): df.loc[:, "a"] = other expected = DataFrame({"a": [999, 200], "b": [3, 4]}) tm.assert_frame_equal(expected, df) + + def test_loc_reindexing_of_empty_index(self): + # GH 57735 + df = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) + df.loc[Series([False] * 4, index=df.index, name=0), 0] = df[0] + expected = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) + tm.assert_frame_equal(df, expected) From 9c8c685f481fcd63f08da39885ed48e93de58855 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 23:17:22 +0200 Subject: [PATCH 244/272] TST (string dtype): xfail all currently failing tests with future.infer_string (#59329) * TST (string dtype): xfail all currently failing tests with future.infer_string * more xfails * more xfails * add missing strict=False * also run slow and single cpu tests * fix single_cpu tests * xfail some slow tests * stop suppressing non-zero exit code from pytest on string CI build * remove accidentally added xlsx file --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 1 - ci/run_tests.sh | 6 ------ pandas/tests/apply/test_frame_apply.py | 6 ++++++ pandas/tests/apply/test_numba.py | 4 ++++ pandas/tests/apply/test_str.py | 4 ++++ 
pandas/tests/arrays/categorical/test_analytics.py | 3 +++ pandas/tests/arrays/categorical/test_api.py | 3 +++ .../tests/arrays/categorical/test_constructors.py | 1 + pandas/tests/arrays/floating/test_arithmetic.py | 3 +++ pandas/tests/arrays/integer/test_arithmetic.py | 3 +++ pandas/tests/arrays/masked/test_function.py | 3 +++ pandas/tests/copy_view/test_array.py | 3 +++ pandas/tests/copy_view/test_astype.py | 6 ++++++ pandas/tests/copy_view/test_constructors.py | 3 +++ pandas/tests/copy_view/test_functions.py | 8 ++++++++ pandas/tests/copy_view/test_internals.py | 3 +++ pandas/tests/copy_view/test_interp_fillna.py | 4 ++++ pandas/tests/copy_view/test_methods.py | 4 ++++ pandas/tests/copy_view/test_replace.py | 5 +++++ pandas/tests/dtypes/test_dtypes.py | 3 +++ pandas/tests/extension/test_arrow.py | 3 +++ pandas/tests/extension/test_string.py | 6 ++++++ pandas/tests/frame/indexing/test_coercion.py | 3 +++ pandas/tests/frame/indexing/test_indexing.py | 7 +++++++ pandas/tests/frame/indexing/test_insert.py | 3 +++ pandas/tests/frame/indexing/test_setitem.py | 8 ++++++++ pandas/tests/frame/indexing/test_where.py | 5 +++++ pandas/tests/frame/indexing/test_xs.py | 3 +++ pandas/tests/frame/methods/test_combine_first.py | 3 +++ pandas/tests/frame/methods/test_convert_dtypes.py | 4 ++++ pandas/tests/frame/methods/test_cov_corr.py | 3 +++ pandas/tests/frame/methods/test_dropna.py | 3 +++ pandas/tests/frame/methods/test_dtypes.py | 3 +++ pandas/tests/frame/methods/test_fillna.py | 2 ++ pandas/tests/frame/methods/test_info.py | 5 +++++ pandas/tests/frame/methods/test_quantile.py | 8 ++++++++ pandas/tests/frame/methods/test_replace.py | 3 +++ pandas/tests/frame/methods/test_reset_index.py | 3 +++ pandas/tests/frame/methods/test_to_csv.py | 8 ++++++++ .../tests/frame/methods/test_to_dict_of_blocks.py | 3 +++ pandas/tests/frame/test_arithmetic.py | 3 +++ pandas/tests/frame/test_arrow_interface.py | 4 ++++ pandas/tests/frame/test_block_internals.py | 5 +++++ pandas/tests/frame/test_constructors.py | 1 + pandas/tests/frame/test_query_eval.py | 3 +++ pandas/tests/frame/test_reductions.py | 4 ++++ pandas/tests/frame/test_stack_unstack.py | 4 ++++ pandas/tests/frame/test_unary.py | 3 +++ pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++++ pandas/tests/groupby/aggregate/test_cython.py | 4 ++++ pandas/tests/groupby/aggregate/test_other.py | 3 +++ pandas/tests/groupby/methods/test_describe.py | 4 ++++ pandas/tests/groupby/methods/test_nth.py | 3 +++ pandas/tests/groupby/methods/test_quantile.py | 4 ++++ pandas/tests/groupby/methods/test_size.py | 3 +++ pandas/tests/groupby/methods/test_value_counts.py | 4 ++++ pandas/tests/groupby/test_categorical.py | 3 +++ pandas/tests/groupby/test_groupby.py | 6 ++++++ pandas/tests/groupby/test_groupby_dropna.py | 3 +++ pandas/tests/groupby/test_grouping.py | 4 ++++ pandas/tests/groupby/test_pipe.py | 4 ++++ pandas/tests/groupby/test_raises.py | 4 ++++ pandas/tests/groupby/test_reductions.py | 3 +++ pandas/tests/groupby/test_timegrouper.py | 3 +++ pandas/tests/groupby/transform/test_transform.py | 6 ++++++ pandas/tests/indexes/base_class/test_setops.py | 3 +++ pandas/tests/indexes/test_old_base.py | 1 + pandas/tests/indexing/test_iloc.py | 3 +++ pandas/tests/indexing/test_indexing.py | 3 +++ pandas/tests/indexing/test_loc.py | 1 + pandas/tests/interchange/test_impl.py | 4 ++++ pandas/tests/io/excel/test_readers.py | 1 + pandas/tests/io/excel/test_writers.py | 4 ++++ pandas/tests/io/formats/style/test_to_latex.py | 3 +++ pandas/tests/io/json/test_pandas.py | 9 +++++++++ 
 pandas/tests/io/parser/common/test_chunksize.py | 3 +++
 pandas/tests/io/parser/common/test_common_basic.py | 3 +++
 .../tests/io/parser/common/test_file_buffer_url.py | 3 +++
 pandas/tests/io/parser/common/test_index.py | 3 +++
 pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 7 +++++++
 pandas/tests/io/parser/test_c_parser_only.py | 3 +++
 pandas/tests/io/parser/test_converters.py | 3 +++
 pandas/tests/io/parser/test_mangle_dupes.py | 3 +++
 pandas/tests/io/parser/test_na_values.py | 7 +++++++
 pandas/tests/io/parser/test_parse_dates.py | 5 +++++
 pandas/tests/io/parser/test_python_parser_only.py | 4 ++++
 pandas/tests/io/parser/test_read_fwf.py | 3 +++
 pandas/tests/io/parser/test_upcast.py | 3 +++
 pandas/tests/io/pytables/test_append.py | 7 ++++++-
 pandas/tests/io/pytables/test_categorical.py | 7 ++++++-
 pandas/tests/io/pytables/test_complex.py | 6 ++++++
 pandas/tests/io/pytables/test_errors.py | 7 ++++++-
 pandas/tests/io/pytables/test_file_handling.py | 7 ++++++-
 pandas/tests/io/pytables/test_put.py | 7 ++++++-
 pandas/tests/io/pytables/test_read.py | 7 ++++++-
 pandas/tests/io/pytables/test_round_trip.py | 7 ++++++-
 pandas/tests/io/pytables/test_select.py | 7 ++++++-
 pandas/tests/io/pytables/test_store.py | 7 ++++++-
 pandas/tests/io/pytables/test_timezones.py | 6 ++++++
 pandas/tests/io/sas/test_sas7bdat.py | 6 ++++++
 pandas/tests/io/test_clipboard.py | 6 ++++++
 pandas/tests/io/test_common.py | 7 +++++++
 pandas/tests/io/test_compression.py | 3 +++
 pandas/tests/io/test_feather.py | 4 ++++
 pandas/tests/io/test_fsspec.py | 4 ++++
 pandas/tests/io/test_gcs.py | 3 +++
 pandas/tests/io/test_html.py | 6 ++++++
 pandas/tests/io/test_http_headers.py | 3 +++
 pandas/tests/io/test_orc.py | 11 ++++++++---
 pandas/tests/io/test_parquet.py | 3 +++
 pandas/tests/io/test_sql.py | 11 ++++++++---
 pandas/tests/io/test_stata.py | 13 +++++++++++++
 pandas/tests/io/xml/test_xml.py | 3 +++
 pandas/tests/io/xml/test_xml_dtypes.py | 4 ++++
 pandas/tests/reductions/test_reductions.py | 5 +++++
 pandas/tests/resample/test_resampler_grouper.py | 3 +++
 pandas/tests/reshape/concat/test_concat.py | 3 +++
 pandas/tests/reshape/merge/test_merge_asof.py | 3 +++
 pandas/tests/reshape/test_from_dummies.py | 3 +++
 pandas/tests/reshape/test_melt.py | 10 ++++++++++
 pandas/tests/reshape/test_pivot.py | 2 ++
 pandas/tests/reshape/test_union_categoricals.py | 3 +++
 pandas/tests/series/accessors/test_dt_accessor.py | 6 ++++++
 pandas/tests/series/indexing/test_indexing.py | 3 +++
 pandas/tests/series/indexing/test_setitem.py | 6 ++++++
 pandas/tests/series/methods/test_info.py | 3 +++
 pandas/tests/series/methods/test_replace.py | 1 +
 pandas/tests/series/methods/test_to_csv.py | 3 +++
 pandas/tests/series/methods/test_unstack.py | 3 +++
 pandas/tests/series/test_arithmetic.py | 3 +++
 pandas/tests/series/test_logical_ops.py | 3 +++
 pandas/tests/test_algos.py | 4 ++++
 132 files changed, 543 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index c0461943ce9c8..4539884e6afd3 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -59,7 +59,6 @@ jobs:
           extra_loc: "zh_CN"
         - name: "Future infer strings"
           env_file: actions-311.yaml
-          pattern: "not slow and not network and not single_cpu"
           pandas_future_infer_string: "1"
         - name: "Pypy"
           env_file: actions-pypy-39.yaml
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index c6071100fc86f..d2c2f58427a23 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -16,11 +16,5 @@ if [[ "$PATTERN" ]]; then
   PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
 fi
 
-# temporarily let pytest always succeed (many tests are not yet passing in the
-# build enabling the future string dtype)
-if [[ "$PANDAS_FUTURE_INFER_STRING" == "1" ]]; then
-  PYTEST_CMD="$PYTEST_CMD || true"
-fi
-
 echo $PYTEST_CMD
 sh -c "$PYTEST_CMD"
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index ba405d4bd1cab..b0475b64a844e 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
 import pandas as pd
@@ -61,6 +63,7 @@ def test_apply(float_frame, engine, request):
     assert result.index is float_frame.index
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("axis", [0, 1])
 @pytest.mark.parametrize("raw", [True, False])
 @pytest.mark.parametrize("nopython", [True, False])
@@ -1213,6 +1216,7 @@ def test_agg_with_name_as_column_name():
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_agg_multiple_mixed():
     # GH 20909
     mdf = DataFrame(
@@ -1338,6 +1342,7 @@ def test_named_agg_reduce_axis1_raises(float_frame):
         float_frame.agg(row1=(name1, "sum"), row2=(name2, "max"), axis=axis)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_nuiscance_columns():
     # GH 15015
     df = DataFrame(
@@ -1514,6 +1519,7 @@ def test_apply_datetime_tz_issue(engine, request):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})])
 @pytest.mark.parametrize("method", ["min", "max", "sum"])
 def test_mixed_column_raises(df, method, using_infer_string):
diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
index 57b81711ddb48..aee9100702350 100644
--- a/pandas/tests/apply/test_numba.py
+++ b/pandas/tests/apply/test_numba.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas.util._test_decorators as td
 
 from pandas import (
@@ -17,6 +19,7 @@ def apply_axis(request):
     return request.param
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_numba_vs_python_noop(float_frame, apply_axis):
     func = lambda x: x
     result = float_frame.apply(func, engine="numba", axis=apply_axis)
@@ -40,6 +43,7 @@ def test_numba_vs_python_string_index():
     )
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_numba_vs_python_indexing():
     frame = DataFrame(
         {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]},
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
index e224b07a1097b..732652f24e2eb 100644
--- a/pandas/tests/apply/test_str.py
+++ b/pandas/tests/apply/test_str.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import WASM
 
 from pandas.core.dtypes.common import is_number
@@ -79,6 +81,7 @@ def test_apply_np_transformer(float_frame, op, how):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "series, func, expected",
     chain(
@@ -137,6 +140,7 @@ def test_agg_cython_table_series(series, func, expected):
     assert result == expected
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "series, func, expected",
     chain(
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index 1021b18f4ae71..dca33dffa3996 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import PYPY
 
 from pandas import (
@@ -294,6 +296,7 @@ def test_nbytes(self):
         exp = 3 + 3 * 8  # 3 int8s for values + 3 int64s for categories
         assert cat.nbytes == exp
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_memory_usage(self):
         cat = Categorical([1, 2, 3])
 
diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py
index 2791fd55f54d7..2ccc5781c608e 100644
--- a/pandas/tests/arrays/categorical/test_api.py
+++ b/pandas/tests/arrays/categorical/test_api.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import PY311
 
 from pandas import (
@@ -149,6 +151,7 @@ def test_reorder_categories_raises(self, new_categories):
         with pytest.raises(ValueError, match=msg):
             cat.reorder_categories(new_categories)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_add_categories(self):
         cat = Categorical(["a", "b", "c", "a"], ordered=True)
         old = cat.copy()
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
index 6752a503016f8..e0bd8386b2c41 100644
--- a/pandas/tests/arrays/categorical/test_constructors.py
+++ b/pandas/tests/arrays/categorical/test_constructors.py
@@ -735,6 +735,7 @@ def test_interval(self):
         tm.assert_numpy_array_equal(cat.codes, expected_codes)
         tm.assert_index_equal(cat.categories, idx)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_categorical_extension_array_nullable(self, nulls_fixture):
         # GH:
         arr = pd.arrays.StringArray._from_sequence(
diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py
index ba081bd01062a..768d3c1449fa4 100644
--- a/pandas/tests/arrays/floating/test_arithmetic.py
+++ b/pandas/tests/arrays/floating/test_arithmetic.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.core.arrays import FloatingArray
@@ -122,6 +124,7 @@ def test_arith_zero_dim_ndarray(other):
 # -----------------------------------------------------------------------------
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     op = all_arithmetic_operators
     s = pd.Series(data)
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
index 8acd298f37a07..8aa8c2db940b4 100644
--- a/pandas/tests/arrays/integer/test_arithmetic.py
+++ b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.core import ops
@@ -172,6 +174,7 @@ def test_numpy_zero_dim_ndarray(other):
 # -----------------------------------------------------------------------------
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     op = all_arithmetic_operators
     s = pd.Series(data)
diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py
index b4b1761217826..6b352758b3ae6 100644
--- a/pandas/tests/arrays/masked/test_function.py
+++ b/pandas/tests/arrays/masked/test_function.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.core.dtypes.common import is_integer_dtype
 
 import pandas as pd
@@ -58,6 +60,7 @@ def test_tolist(data):
     tm.assert_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_to_numpy():
     # GH#56991
 
diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
index bb238d08bd9bd..bcc8a212fbb98 100644
--- a/pandas/tests/copy_view/test_array.py
+++ b/pandas/tests/copy_view/test_array.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     DataFrame,
     Series,
@@ -117,6 +119,7 @@ def test_dataframe_array_ea_dtypes():
     assert arr.flags.writeable is False
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_dataframe_array_string_dtype():
     df = DataFrame({"a": ["a", "b"]}, dtype="string")
     arr = np.asarray(df)
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index d1e4104e16465..a503841386fbc 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat.pyarrow import pa_version_under12p0
 import pandas.util._test_decorators as td
 
@@ -82,6 +84,7 @@ def test_astype_numpy_to_ea():
     assert np.shares_memory(get_array(ser), get_array(result))
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype, new_dtype", [("object", "string"), ("string", "object")]
 )
@@ -95,6 +98,7 @@ def test_astype_string_and_object(dtype, new_dtype):
     tm.assert_frame_equal(df, df_orig)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype, new_dtype", [("object", "string"), ("string", "object")]
 )
@@ -195,6 +199,7 @@ def test_astype_arrow_timestamp():
     assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_convert_dtypes_infer_objects():
     ser = Series(["a", "b", "c"])
     ser_orig = ser.copy()
@@ -210,6 +215,7 @@ def test_convert_dtypes_infer_objects():
     tm.assert_series_equal(ser, ser_orig)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_convert_dtypes():
     df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
     df_orig = df.copy()
diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
index eb5177e393936..743e094032505 100644
--- a/pandas/tests/copy_view/test_constructors.py
+++ b/pandas/tests/copy_view/test_constructors.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -207,6 +209,7 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype):
     assert np.shares_memory(arr_before, arr_after)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] ) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 196d908a44a46..d2e2d43b0a42b 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -12,6 +14,7 @@ from pandas.tests.copy_view.util import get_array +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_frames(): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -30,6 +33,7 @@ def test_concat_frames(): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_frames_updating_input(): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -149,6 +153,7 @@ def test_concat_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -200,6 +205,7 @@ def test_merge_on_index(): tm.assert_frame_equal(df2, df2_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "func, how", [ @@ -243,6 +249,7 @@ def test_merge_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_on_key(): df_index = Index(["a", "b", "c"], name="key") @@ -270,6 +277,7 @@ def test_join_on_key(): tm.assert_frame_equal(df2, df2_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(): df_index = Index(["a", "b", "c"], name="key") diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index a4cb1e6bea9c9..b2a26ceacd6c3 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import DataFrame import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -40,6 +42,7 @@ def test_consolidate(): assert df.loc[0, "b"] == 0.1 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index abd87162ec32e..f80e9b7dcf838 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( NA, DataFrame, @@ -110,6 +112,7 @@ def test_interp_fill_functions_inplace(func, dtype): assert view._mgr._has_no_reference(0) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_cannot_with_object_dtype(): df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) @@ -118,6 +121,7 @@ def test_interpolate_cannot_with_object_dtype(): df.interpolate() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(): df = 
DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 6f0cbe12a2ea0..3716df8fbf855 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -712,6 +714,7 @@ def test_head_tail(method): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_infer_objects(): df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) df_orig = df.copy() @@ -896,6 +899,7 @@ def test_sort_values_inplace(obj, kwargs): tm.assert_equal(view, obj_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(decimals): df = DataFrame({"a": [1, 2], "b": "c"}) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 2eb88923c0087..c1120ccfea635 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -9,6 +11,7 @@ from pandas.tests.copy_view.util import get_array +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "replace_kwargs", [ @@ -56,6 +59,7 @@ def test_replace_regex_inplace_refs(): tm.assert_frame_equal(view, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_regex_inplace(): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") @@ -253,6 +257,7 @@ def test_replace_empty_list(): assert not df2._mgr._has_no_reference(0) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(value): df = DataFrame({"a": ["a", "b", "c"]}) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 903c13587151a..b6c5becf49fa0 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.core.dtypes.base import _registry as registry @@ -959,6 +961,7 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(["b", "a"], ordered=True) assert c1 is not c2 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("ordered2", [True, False, None]) def test_categorical_equality(self, ordered, ordered2): # same categories, same order diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ea9c5096638d5..dbf353d87178f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -32,6 +32,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas._libs.tslibs import timezones from pandas.compat import ( @@ -1993,6 +1995,7 @@ def test_str_find_large_start(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( pa_version_under13p0, 
reason="https://github.com/apache/arrow/issues/36311" ) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 4628c5568b49b..64b383ded97b5 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -22,6 +22,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -29,6 +31,10 @@ from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def maybe_split_array(arr, chunked): if not chunked: diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index 472bfb7772a80..cb1cbd68ede63 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -82,6 +84,7 @@ def test_6942(indexer_al): assert df.iloc[0, 0] == t2 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_26395(indexer_al): # .at case fixed by GH#45121 (best guess) df = DataFrame(index=["A", "B", "C"]) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index a95fc10157a29..b0b33b4a565ec 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import iNaT from pandas.errors import InvalidIndexError @@ -174,6 +176,7 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram if bif[c].dtype != bifw[c].dtype: assert bif[c].dtype == df[c].dtype + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to df = datetime_frame.copy() @@ -501,6 +504,7 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] @@ -1121,6 +1125,7 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1143,6 +1148,7 @@ def test_loc_setitem_datetimelike_with_inference(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { @@ -1871,6 +1877,7 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: tm.assert_frame_equal(df, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 pytest.importorskip("pyarrow") diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index b530cb98ef46c..3dd8f7196c594 100644 --- a/pandas/tests/frame/indexing/test_insert.py 
+++ b/pandas/tests/frame/indexing/test_insert.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import PerformanceWarning from pandas import ( @@ -61,6 +63,7 @@ def test_insert_column_bug_4032(self): expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_insert_with_columns_dups(self): # GH#14291 df = DataFrame() diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 75f52a57a0949..cb971b31c13c4 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.base import _registry as ea_registry from pandas.core.dtypes.common import is_object_dtype from pandas.core.dtypes.dtypes import ( @@ -144,6 +146,7 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_empty_columns(self): # GH 13522 df = DataFrame(index=["A", "B", "C"]) @@ -159,6 +162,7 @@ def test_setitem_dt64_index_empty_columns(self): df["A"] = rng assert df["A"].dtype == np.dtype("M8[ns]") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) @@ -198,6 +202,7 @@ def test_setitem_with_unaligned_sparse_value(self): expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_period_preserves_dtype(self): # GH: 26861 data = [Period("2003-12", "D")] @@ -667,6 +672,7 @@ def test_setitem_iloc_two_dimensional_generator(self): expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]}) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") @@ -699,6 +705,7 @@ def test_setitem_ea_dtype_rhs_series(self): expected = DataFrame({"a": [1, 2]}, dtype="Int64") tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -920,6 +927,7 @@ def test_setitem_with_expansion_categorical_dtype(self): ser.name = "E" tm.assert_series_equal(result2.sort_index(), ser.sort_index()) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 0f22ff52d5212..1d7b3e12b2e86 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -46,6 +48,7 @@ def is_ok(s): class TestDataFrameIndexingWhere: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_get(self, where_frame, float_string_frame): def _check_get(df, cond, 
check_dtypes=True): other1 = _safe_add(df) @@ -96,6 +99,7 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -170,6 +174,7 @@ def test_where_invalid(self): with pytest.raises(ValueError, match=msg): df.mask(0) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 4878f74bd152e..a01b68f1fea2a 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -72,6 +74,7 @@ def test_xs_other(self, float_frame): tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) assert not (expected == 5).all() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_xs_corner(self): # pathological mixed-type reordering case df = DataFrame(index=[0]) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 99c8ddc643fee..87b7d5052a345 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal @@ -30,6 +32,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 9cbbebf35b2d1..91fa81b5bee2e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm @@ -181,6 +183,7 @@ def test_convert_dtypes_pyarrow_timestamp(self): result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_avoid_block_splitting(self): # GH#55341 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) @@ -195,6 +198,7 @@ def test_convert_dtypes_avoid_block_splitting(self): tm.assert_frame_equal(result, expected) assert result._mgr.nblocks == 2 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index aeaf80f285f9d..c15952339ef18 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import 
using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -318,6 +320,7 @@ def test_corrwith_non_timeseries_data(self): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 11893d7fac1a4..4a60dc09cfe07 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -182,6 +184,7 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 0697f59cd271f..1685f9ee331f5 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -133,6 +135,7 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index b72cac6f3f9a1..ad1a37916e381 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -84,6 +84,7 @@ def test_fillna_mixed_float(self, mixed_float_frame): result = mf.ffill() _check_mixed_float(result, dtype={"C": None}) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( @@ -275,6 +276,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 17cb989626e70..a4319f8a8ae7f 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( IS64, PYPY, @@ -433,6 +435,7 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_info_memory_usage_qualified(): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) @@ -493,6 +496,7 @@ def test_info_categorical(): df.info(buf=buf) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.xfail(not IS64, 
reason="GH 36579: fail on 32-bit system") def test_info_int_columns(): # GH#37245 @@ -516,6 +520,7 @@ def test_info_int_columns(): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 4181740d62627..fedbdbc98660f 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -324,6 +326,7 @@ def test_quantile_multi_empty(self, interp_method): ) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_datetime(self, unit): dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) df = DataFrame({"a": dti, "b": [0, 5]}) @@ -377,6 +380,7 @@ def test_quantile_datetime(self, unit): expected = DataFrame(index=[0.5], columns=[]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [ @@ -641,6 +645,7 @@ def test_quantile_nat(self, interp_method, unit): ) tm.assert_frame_equal(res, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_empty_no_rows_floats(self, interp_method): interpolation, method = interp_method @@ -869,6 +874,7 @@ def test_quantile_ea_scalar(self, request, obj, index): else: tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype, expected_data, expected_index, axis", [ @@ -887,6 +893,7 @@ def test_empty_numeric(self, dtype, expected_data, expected_index, axis): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype, expected_data, expected_index, axis, expected_dtype", [ @@ -905,6 +912,7 @@ def test_empty_datelike( ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "expected_data, expected_index, axis", [ diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 0a980e5d358a5..6b872bf48d550 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -601,6 +601,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( @@ -1268,6 +1269,7 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): assert return_value is None tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1364,6 +1366,7 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_intervals(self): # 
https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 980dd5243daa5..c487bc4cfb89a 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -642,6 +644,7 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): tm.assert_frame_equal(res, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "array, dtype", [ diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 44794906b8e60..7fb1658394632 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserError import pandas as pd @@ -42,6 +44,7 @@ def test_to_csv_from_csv1(self, temp_file, float_frame): float_frame.to_csv(path, header=False) float_frame.to_csv(path, index=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_from_csv1_datetime(self, temp_file, datetime_frame): path = str(temp_file) # test roundtrip @@ -436,6 +439,7 @@ def test_to_csv_empty(self): result, expected = self._return_result_expected(df, 1000) tm.assert_frame_equal(result, expected, check_column_type=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.slow def test_to_csv_chunksize(self): chunksize = 1000 @@ -448,6 +452,7 @@ def test_to_csv_chunksize(self): result, expected = self._return_result_expected(df, chunksize, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] @@ -544,6 +549,7 @@ def test_to_csv_headers(self, temp_file): assert return_value is None tm.assert_frame_equal(to_df, recons) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_multiindex(self, temp_file, float_frame, datetime_frame): frame = float_frame old_index = frame.index @@ -737,6 +743,7 @@ def test_to_csv_withcommas(self, temp_file): df2 = self.read_csv(path) tm.assert_frame_equal(df2, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_mixed(self, temp_file): def create_cols(name): return [f"{name}{i:03d}" for i in range(5)] @@ -822,6 +829,7 @@ def test_to_csv_dups_cols(self, temp_file): result.columns = df.columns tm.assert_frame_equal(result, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_dups_cols2(self, temp_file): # GH3457 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 0f1f643209db0..4f621b4643b70 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, MultiIndex, @@ -25,6 +27,7 @@ def 
test_no_copy_blocks(self, float_frame): assert _last_df is not None and not _last_df[column].equals(df[column]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_dict_of_blocks_item_cache(): # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 3971e58e8235e..11e51056d51d0 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -1540,6 +1542,7 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index 098d1829b973c..dc163268f64b9 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -2,6 +2,8 @@ import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -9,6 +11,7 @@ pa = pytest.importorskip("pyarrow") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="14.0") def test_dataframe_arrow_interface(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) @@ -31,6 +34,7 @@ def test_dataframe_arrow_interface(): assert table.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="15.0") def test_dataframe_to_arrow(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 3f0e829f66361..c95c382bb5131 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -160,6 +162,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types @@ -191,6 +194,7 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert @@ -395,6 +399,7 @@ def test_update_inplace_sets_valid_block_values(): assert isinstance(df._mgr.blocks[0].values, Categorical) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6416ea6415eb3..607e333d82823 100644 --- a/pandas/tests/frame/test_constructors.py +++ 
b/pandas/tests/frame/test_constructors.py @@ -1935,6 +1935,7 @@ def test_constructor_with_datetimes4(self): df = DataFrame({"value": dr}) assert str(df.iat[0, 0].tz) == "US/Eastern" + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_constructor_with_datetimes5(self): # GH 7822 # preserver an index with a tz on dict construction diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 4f10fb2e0e9f5..aa2fb19fe8528 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( NumExprClobberingError, UndefinedVariableError, @@ -759,6 +761,7 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3f4a5f2c97b6c..4c355ed92b6c3 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -606,6 +606,7 @@ def test_sem(self, datetime_frame): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, expected", [ @@ -1057,6 +1058,7 @@ def test_sum_bools(self): # ---------------------------------------------------------------------- # Index of max / min + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1107,6 +1109,7 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1346,6 +1349,7 @@ def test_any_all_extra(self): result = df[["C"]].all(axis=None).item() assert result is True + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) def test_any_all_object_dtype( self, axis, all_boolean_reductions, skipna, using_infer_string diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index fc532a565a173..92bcd6f0c7d0c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib import pandas as pd @@ -1669,6 +1671,7 @@ def test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -1919,6 +1922,7 @@ def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) + 
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index e89175ceff0c1..1887fa61ad081 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -128,6 +130,7 @@ def test_pos_object(self, df_data): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning") def test_pos_object_raises(self): # GH#21380 diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 26602baedb594..46c27849356b5 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -294,6 +296,7 @@ def aggfun_1(ser): assert len(result) == 0 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) @@ -1114,6 +1117,7 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index bf9e82480785c..4a4f5882b7e85 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -90,6 +92,7 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} @@ -143,6 +146,7 @@ def test_cython_agg_return_dict(): tm.assert_series_equal(ts, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 78f2917e9a057..835cad0d13078 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError import pandas as pd @@ -306,6 +308,7 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index 0f5fc915f9523..5f1f85d8179cd 100644 --- 
a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -71,6 +73,7 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() @@ -246,6 +249,7 @@ def test_describe_non_cython_paths(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [int, float, object]) @pytest.mark.parametrize( "kwargs", diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index 1b852abad6c8e..d20b30834dea2 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -677,6 +679,7 @@ def test_first_multi_key_groupby_categorical(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index af0deba138469..0e31c0698cb1e 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -156,6 +158,7 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) @@ -238,6 +241,7 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 5a3eb49e97fb7..edeac642551a0 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -76,6 +78,7 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 0f136b06c782a..14d3dbd6fa496 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ 
b/pandas/tests/groupby/methods/test_value_counts.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas.util._test_decorators as td
 
 from pandas import (
@@ -273,6 +275,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending):
     return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("groupby", ["column", "array", "function"])
 @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
 @pytest.mark.parametrize(
@@ -356,6 +359,7 @@ def test_against_frame_and_seriesgroupby(
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 010bd9ee52555..c35f5d2bc26e8 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -320,6 +322,7 @@ def test_apply(ordered):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_observed(observed):
     # multiple groupers, don't re-expand the output space
     # of the grouper
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 93e891c51b86c..5ac6dc990c092 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import SpecificationError
 import pandas.util._test_decorators as td
 
@@ -1261,6 +1263,7 @@ def test_groupby_two_group_keys_all_nan():
     assert result == {}
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_groupby_2d_malformed():
     d = DataFrame(index=range(2))
     d["group"] = ["g1", "g2"]
@@ -2325,6 +2328,7 @@ def test_groupby_all_nan_groups_drop():
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("numeric_only", [True, False])
 def test_groupby_empty_multi_column(as_index, numeric_only):
     # GH 15106 & GH 41998
@@ -2341,6 +2345,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_groupby_aggregation_non_numeric_dtype():
     # GH #43108
     df = DataFrame(
@@ -2498,6 +2503,7 @@ def test_groupby_none_in_first_mi_level():
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_groupby_none_column_name():
     # GH#47348
     df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]})
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index cedbd577da0ca..d42aa06d6bbfe 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat.pyarrow import pa_version_under10p1
 
 from pandas.core.dtypes.missing import na_value_for_dtype
@@ -97,6 +99,7 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
     tm.assert_frame_equal(grouped, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "dropna, idx, outputs",
     [
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 814b35ad577f1..fc2a8a970010a 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -10,6 +10,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import SpecificationError
 
 import pandas as pd
@@ -805,6 +807,7 @@ def test_groupby_empty(self):
         expected = ["name"]
         assert result == expected
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_groupby_level_index_value_all_na(self):
         # issue 20519
         df = DataFrame(
@@ -978,6 +981,7 @@ def test_groupby_with_empty(self):
         grouped = series.groupby(grouper)
         assert next(iter(grouped), None) is None
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_groupby_with_single_column(self):
         df = DataFrame({"a": list("abssbab")})
         tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py
index 7d5c1625b8ab4..1044c83e3e56b 100644
--- a/pandas/tests/groupby/test_pipe.py
+++ b/pandas/tests/groupby/test_pipe.py
@@ -1,4 +1,7 @@
 import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
 
 import pandas as pd
 from pandas import (
@@ -8,6 +11,7 @@
 import pandas._testing as tm
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_pipe():
     # Test the pipe method of DataFrameGroupBy.
     # Issue #17871
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
index 9f3e620ca9872..f28967fa81ddb 100644
--- a/pandas/tests/groupby/test_raises.py
+++ b/pandas/tests/groupby/test_raises.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     Categorical,
     DataFrame,
@@ -104,6 +106,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""):
             gb.transform(groupby_func, *args)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("how", ["method", "agg", "transform"])
 def test_groupby_raises_string(
     how, by, groupby_series, groupby_func, df_with_string_col
@@ -205,6 +208,7 @@ def func(x):
         getattr(gb, how)(func)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("how", ["agg", "transform"])
 @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
 def test_groupby_raises_string_np(
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index 00438c2100bad..8a421654cdf9b 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs import iNaT
 
 from pandas.core.dtypes.common import pandas_dtype
@@ -468,6 +470,7 @@ def test_max_min_non_numeric():
     assert "ss" in result
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_max_min_object_multiple_columns():
     # GH#41111 case where the aggregation is valid for some columns but not
     # others; we split object blocks column-wise, consistent with
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index 44e8e050cb756..ee4973cbf18af 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -11,6 +11,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -74,6 +76,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
 
 
 class TestGroupBy:
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_groupby_with_timegrouper(self):
         # GH 4161
         # TimeGrouper requires a sorted index
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index a189d6772ece4..a65dda1570944 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs import lib
 
 from pandas.core.dtypes.common import ensure_platform_int
@@ -370,6 +372,7 @@ def test_transform_select_columns(df):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_transform_nuisance_raises(df):
     # case that goes through _transform_item_by_item
 
@@ -442,6 +445,7 @@ def test_transform_coercion():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_groupby_transform_with_int():
     # GH 3740, make sure that we might upcast on item-by-item transform
 
@@ -701,6 +705,7 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.slow
 @pytest.mark.parametrize(
     "op, args, targop",
@@ -1025,6 +1030,7 @@ def test_groupby_transform_with_datetimes(func, values):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_groupby_transform_dtype():
     # GH 22243
     df = DataFrame({"a": [1], "val": [1.35]})
diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py
index d57df82b2358c..f9636ec19f2ec 100644
--- a/pandas/tests/indexes/base_class/test_setops.py
+++ b/pandas/tests/indexes/base_class/test_setops.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     Index,
@@ -231,6 +233,7 @@ def test_tuple_union_bug(self, method, expected, sort):
         expected = Index(expected)
         tm.assert_index_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("first_list", [["b", "a"], []])
     @pytest.mark.parametrize("second_list", [["a", "b"], []])
     @pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
index 2f22c2490755e..4b10dba4afc72 100644
--- a/pandas/tests/indexes/test_old_base.py
+++ b/pandas/tests/indexes/test_old_base.py
@@ -829,6 +829,7 @@ def test_append_preserves_dtype(self, simple_index):
         alt = index.take(list(range(N)) * 2)
         tm.assert_index_equal(result, alt, check_exact=True)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_inv(self, simple_index, using_infer_string):
         idx = simple_index
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index 417925f8ecb0d..b05b5d3dea2dc 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import IndexingError
 
 from pandas import (
@@ -1196,6 +1198,7 @@ def test_iloc_getitem_int_single_ea_block_view(self):
         arr[2] = arr[-1]
         assert ser[0] == arr[-1]
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
    def test_iloc_setitem_multicolumn_to_datetime(self):
         # GH#20511
         df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]})
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index e8d16f8240db6..6b072bc27ed81 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import IndexingError
 
 from pandas.core.dtypes.common import (
@@ -526,6 +528,7 @@ def test_string_slice_empty(self):
         with pytest.raises(KeyError, match="^0$"):
             df.loc["2011", 0]
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_astype_assignment(self, using_infer_string):
         # GH4312 (iloc)
         df_orig = DataFrame(
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index f90bd9e6802c8..1b2dc0819006c 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -609,6 +609,7 @@ def test_loc_setitem_consistency_empty(self):
         expected["x"] = expected["x"].astype(np.int64)
         tm.assert_frame_equal(df, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_loc_setitem_consistency_slice_column_len(self):
         # .loc[:,column] setting with slice == len of the column
         # GH10408
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 64eca6ac643ca..76910db941d36 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs import iNaT
 from pandas.compat import (
     is_ci_environment,
@@ -407,6 +409,7 @@ def test_empty_string_column():
     tm.assert_frame_equal(df, result)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_large_string():
     # GH#56702
     pytest.importorskip("pyarrow")
@@ -423,6 +426,7 @@ def test_non_str_names():
     assert names == ["0"]
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_non_str_names_w_duplicates():
     # https://github.com/pandas-dev/pandas/issues/56701
     df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index bc041882b9fab..f7d01cc403d6c 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -629,6 +629,7 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
         expected = DataFrame(expected)
         tm.assert_frame_equal(actual, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel):
         # GH#36712
         if read_ext in (".xlsb", ".xls"):
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index d81fde42d5386..0d753cb871c64 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -12,6 +12,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat._optional import import_optional_dependency
 import pandas.util._test_decorators as td
 
@@ -280,6 +282,7 @@ def test_excel_multindex_roundtrip(
         )
         tm.assert_frame_equal(df, act, check_names=check_names)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_read_excel_parse_dates(self, tmp_excel):
         # see gh-11544, gh-12051
         df = DataFrame(
@@ -1332,6 +1335,7 @@ def test_freeze_panes(self, tmp_excel):
         result = pd.read_excel(tmp_excel, index_col=0)
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_path_path_lib(self, engine, ext):
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py
index eb221686dd165..1abe6238d3922 100644
--- a/pandas/tests/io/formats/style/test_to_latex.py
+++ b/pandas/tests/io/formats/style/test_to_latex.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     DataFrame,
     MultiIndex,
@@ -729,6 +731,7 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp):
     )
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("index", [True, False])
 @pytest.mark.parametrize(
     "columns, siunitx",
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 3c551e80ef00b..5867502f9cffb 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -189,6 +189,7 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame):
 
         assert_json_roundtrip_equal(result, expected, orient)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("dtype", [False, np.int64])
     @pytest.mark.parametrize("convert_axes", [True, False])
     def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame):
@@ -274,6 +275,7 @@ def test_roundtrip_empty(self, orient, convert_axes):
 
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("convert_axes", [True, False])
     def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame):
         # TODO: improve coverage with date_format parameter
@@ -701,6 +703,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string
 
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("dtype", [False, None])
     def test_series_roundtrip_object(self, orient, dtype, object_series):
         data = StringIO(object_series.to_json(orient=orient))
@@ -810,6 +813,7 @@ def test_path(self, float_frame, int_frame, datetime_frame):
             df.to_json(path)
             read_json(path)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_axis_dates(self, datetime_series, datetime_frame):
         # frame
         json = StringIO(datetime_frame.to_json())
@@ -822,6 +826,7 @@ def test_axis_dates(self, datetime_series, datetime_frame):
         tm.assert_series_equal(result, datetime_series, check_names=False)
         assert result.name is None
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_convert_dates(self, datetime_series, datetime_frame):
         # frame
         df = datetime_frame
@@ -912,6 +917,7 @@ def test_convert_dates_infer(self, infer_word):
         result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]]
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize(
         "date,date_unit",
         [
@@ -972,6 +978,7 @@ def test_date_format_series_raises(self, datetime_series):
         with pytest.raises(ValueError, match=msg):
             ts.to_json(date_format="iso", date_unit="foo")
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_date_unit(self, unit, datetime_frame):
         df = datetime_frame
         df["date"] = Timestamp("20130101 20:43:42").as_unit("ns")
@@ -1112,6 +1119,7 @@ def test_round_trip_exception(self, datapath):
             res = res.fillna(np.nan)
             tm.assert_frame_equal(res, df)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.network
     @pytest.mark.single_cpu
     @pytest.mark.parametrize(
@@ -2134,6 +2142,7 @@ def test_json_uint64(self):
         result = df.to_json(orient="split")
         assert result == expected
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_read_json_dtype_backend(
         self, string_storage, dtype_backend, orient, using_infer_string
     ):
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
index 78a0b016bd353..a6504473fb55f 100644
--- a/pandas/tests/io/parser/common/test_chunksize.py
+++ b/pandas/tests/io/parser/common/test_chunksize.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs import parsers as libparsers
 from pandas.errors import DtypeWarning
 
@@ -229,6 +231,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
     assert result.a.dtype == float
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_warn_if_chunks_have_mismatched_type(all_parsers):
     warning_type = None
     parser = all_parsers
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index b665cfba8bdc0..511db2c6a33d8 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -764,6 +766,7 @@ def test_dict_keys_as_names(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
 def test_encoding_surrogatepass(all_parsers):
     # GH39017
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
index ba31a9bc15fb5..d8b8f24abcedd 100644
--- a/pandas/tests/io/parser/common/test_file_buffer_url.py
+++ b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -15,6 +15,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import WASM
 from pandas.errors import (
     EmptyDataError,
@@ -69,6 +71,7 @@ def test_local_file(all_parsers, csv_dir_path):
         pytest.skip("Failing on: " + " ".join(platform.uname()))
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # AssertionError: DataFrame.index are different
 def test_path_path_lib(all_parsers):
     parser = all_parsers
diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py
index 4cfc12cdc46aa..54b59ac4e25ed 100644
--- a/pandas/tests/io/parser/common/test_index.py
+++ b/pandas/tests/io/parser/common/test_index.py
@@ -9,6 +9,8 @@
 
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     DataFrame,
     Index,
@@ -86,6 +88,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
 def test_multi_index_no_level_names(all_parsers, index_col):
     data = """index1,index2,A,B,C,D
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index ba928abcb30ad..a5c57a81d8069 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import ParserWarning
 
 import pandas as pd
@@ -55,6 +57,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_per_column(all_parsers):
     parser = all_parsers
@@ -299,6 +302,7 @@ def test_true_values_cast_to_bool(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
 def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
@@ -314,6 +318,7 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     # GH#42022
@@ -456,6 +461,7 @@ def test_dtype_backend_and_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_dtype_backend_string(all_parsers, string_storage):
     # GH#36712
     pa = pytest.importorskip("pyarrow")
@@ -499,6 +505,7 @@ def test_dtype_backend_ea_dtype_specified(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_dtype_backend_pyarrow(all_parsers, request):
     # GH#36712
     pa = pytest.importorskip("pyarrow")
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 39718ca2ec134..9226f265ca2b3 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -18,6 +18,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import WASM
 from pandas.compat.numpy import np_version_gte1p24
 from pandas.errors import (
@@ -182,6 +184,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
     assert max(precise_errors) <= max(normal_errors)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_usecols_dtypes(c_parser_only):
     parser = c_parser_only
     data = """\
diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py
index 7986df62a6b6f..0423327c7333c 100644
--- a/pandas/tests/io/parser/test_converters.py
+++ b/pandas/tests/io/parser/test_converters.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -186,6 +188,7 @@ def convert_score(x):
     tm.assert_frame_equal(results[0], results[1])
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("conv_f", [lambda x: x, str])
 def test_converter_index_col_bug(all_parsers, conv_f):
     # see gh-1835 , GH#40589
diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
index 61d328138da96..6a2ae3bffdc74 100644
--- a/pandas/tests/io/parser/test_mangle_dupes.py
+++ b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -8,6 +8,8 @@
 
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import DataFrame
 import pandas._testing as tm
 
@@ -119,6 +121,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
             parser.read_csv(StringIO(data), names=names)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # AssertionError: DataFrame.columns are different
 def test_mangled_unnamed_placeholders(all_parsers):
     # xref gh-13017
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 1e370f649aef8..360a5feebe073 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.parsers import STR_NA_VALUES
 
 from pandas import (
@@ -259,6 +261,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "kwargs,expected",
    [
@@ -426,6 +429,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @xfail_pyarrow  # mismatched dtypes in both cases, FutureWarning in the True case
 @pytest.mark.parametrize(
     "na_filter,row_data",
@@ -532,6 +536,7 @@ def test_na_values_dict_aliasing(all_parsers):
     tm.assert_dict_equal(na_values, na_values_copy)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_na_values_dict_null_column_name(all_parsers):
     # see gh-57547
     parser = all_parsers
@@ -662,6 +667,7 @@ def test_inf_na_values_with_int_index(all_parsers):
     tm.assert_frame_equal(out, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # mismatched shape
 @pytest.mark.parametrize("na_filter", [True, False])
 def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
@@ -713,6 +719,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
 # TODO: this test isn't about the na_values keyword, it is about the empty entries
 #  being returned with NaN entries, whereas the pyarrow engine returns "nan"
 @xfail_pyarrow  # mismatched shapes
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_str_nan_dropped(all_parsers):
     # see gh-21131
     parser = all_parsers
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index ec7e5575b2e7d..386348c4bd687 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -419,6 +421,7 @@ def test_parse_timezone(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @skip_pyarrow  # pandas.errors.ParserError: CSV parse error
 @pytest.mark.parametrize(
     "date_string",
@@ -606,6 +609,7 @@ def test_date_parser_usecols_thousands(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_dayfirst_warnings():
     # GH 12585
 
@@ -748,6 +752,7 @@ def test_parse_dates_and_string_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_parse_dot_separated_dates(all_parsers):
     # https://github.com/pandas-dev/pandas/issues/2586
     parser = all_parsers
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index c0ea5936164a1..26480010fc687 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -18,6 +18,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import (
     ParserError,
     ParserWarning,
@@ -497,6 +499,7 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
 )
@@ -524,6 +527,7 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "dtype,expected",
     [
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index 45d630c545565..b7b4a77c9e048 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import EmptyDataError
 
 import pandas as pd
@@ -939,6 +941,7 @@ def test_widths_and_usecols():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_dtype_backend(string_storage, dtype_backend):
     # GH#50289
     if string_storage == "python":
diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py
index bc4c4c2e24e9c..d8c40670afcbd 100644
--- a/pandas/tests/io/parser/test_upcast.py
+++ b/pandas/tests/io/parser/test_upcast.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.parsers import (
     _maybe_upcast,
     na_values,
@@ -84,6 +86,7 @@ def test_maybe_upcaste_all_nan():
     tm.assert_extension_array_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("val", [na_values[np.object_], "c"])
 def test_maybe_upcast_object(val, string_storage):
     # GH#36712
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index 7f7f7eccb2382..d3b4bb0ea6c72 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import PY312
 
@@ -23,7 +25,10 @@
     ensure_clean_store,
 )
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 tables = pytest.importorskip("tables")
 
diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py
index 2ab9f1ac8be1c..998021bad9001 100644
--- a/pandas/tests/io/pytables/test_categorical.py
+++ b/pandas/tests/io/pytables/test_categorical.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     Categorical,
     DataFrame,
@@ -14,7 +16,10 @@
     ensure_clean_store,
 )
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 def test_categorical(setup_path):
diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py
index c5cac5a5caf09..d140cfc941e16 100644
--- a/pandas/tests/io/pytables/test_complex.py
+++ b/pandas/tests/io/pytables/test_complex.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -11,6 +13,10 @@
 
 from pandas.io.pytables import read_hdf
 
+pytestmark = pytest.mark.xfail(
+    using_string_dtype(), reason="TODO(infer_string)", strict=False
+)
+
 
 def test_complex_fixed(tmp_path, setup_path):
     df = DataFrame(
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
index 2021101098892..c31b9989ef35e 100644
--- a/pandas/tests/io/pytables/test_errors.py
+++ b/pandas/tests/io/pytables/test_errors.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     CategoricalIndex,
     DataFrame,
@@ -22,7 +24,10 @@
     _maybe_adjust_name,
 )
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 def test_pass_spec_to_storer(setup_path):
diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
index d8f38e9cdad1f..606b19ac0ed75 100644
--- a/pandas/tests/io/pytables/test_file_handling.py
+++ b/pandas/tests/io/pytables/test_file_handling.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import (
     PY311,
     is_ci_environment,
@@ -33,7 +35,10 @@
 from pandas.io import pytables
 from pandas.io.pytables import Term
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py
index d526697c7574a..a4257b54dd6db 100644
--- a/pandas/tests/io/pytables/test_put.py
+++ b/pandas/tests/io/pytables/test_put.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs import Timestamp
 
 import pandas as pd
@@ -22,7 +24,10 @@
 )
 from pandas.util import _test_decorators as td
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 def test_format_type(tmp_path, setup_path):
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index ba108370a4a92..dd3a0eabe95ae 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import is_platform_windows
 
 import pandas as pd
@@ -24,7 +26,10 @@
 
 from pandas.io.pytables import TableIterator
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 def test_read_missing_key_close_store(tmp_path, setup_path):
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 3ad05cec3bca3..6b98a720e4299 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import is_platform_windows
 
@@ -24,7 +26,10 @@
 )
 from pandas.util import _test_decorators as td
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 def test_conv_read_write():
diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py
index 752e2fc570023..4b20b929ef447 100644
--- a/pandas/tests/io/pytables/test_select.py
+++ b/pandas/tests/io/pytables/test_select.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import PY312
 
@@ -25,7 +27,10 @@
 
 from pandas.io.pytables import Term
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 def test_select_columns_in_where(setup_path):
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index 3ce30e313cc30..a6fe9529c594a 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import PY312
 
 import pandas as pd
@@ -33,7 +35,10 @@
     read_hdf,
 )
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 tables = pytest.importorskip("tables")
 
diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py
index 9192804e49bd1..8f179f844e4d0 100644
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs.timezones import maybe_get_tz
 import pandas.util._test_decorators as td
 
@@ -23,6 +25,10 @@
     ensure_clean_store,
 )
 
+pytestmark = pytest.mark.xfail(
+    using_string_dtype(), reason="TODO(infer_string)", strict=False
+)
+
 
 def _compare_with_tz(a, b):
     tm.assert_frame_equal(a, b)
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
index 62f234ec2db4a..3f5b73f4aa8a4 100644
--- a/pandas/tests/io/sas/test_sas7bdat.py
+++ b/pandas/tests/io/sas/test_sas7bdat.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat._constants import (
     IS64,
     WASM,
@@ -18,6 +20,10 @@
 
 from pandas.io.sas.sas7bdat import SAS7BDATReader
 
+pytestmark = pytest.mark.xfail(
+    using_string_dtype(), reason="TODO(infer_string)", strict=False
+)
+
 
 @pytest.fixture
 def dirpath(datapath):
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
index babbddafa3b49..923b880004c26 100644
--- a/pandas/tests/io/test_clipboard.py
+++ b/pandas/tests/io/test_clipboard.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import (
     PyperclipException,
     PyperclipWindowsException,
@@ -28,6 +30,10 @@
     init_qt_clipboard,
 )
 
+pytestmark = pytest.mark.xfail(
+    using_string_dtype(), reason="TODO(infer_string)", strict=False
+)
+
 
 def build_kwargs(sep, excel):
     kwargs = {}
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 26bb2be73838a..c583f9b2c4f99 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -19,6 +19,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import (
     WASM,
     is_platform_windows,
@@ -137,6 +139,7 @@ def test_bytesiowrapper_returns_correct_bytes(self):
         assert result == data.encode("utf-8")
 
     # Test that pyarrow can handle a file opened with get_handle
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_get_handle_pyarrow_compat(self):
         pa_csv = pytest.importorskip("pyarrow.csv")
 
@@ -334,6 +337,7 @@ def test_read_fspath_all(self, reader, module, path, datapath):
             ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"),
         ],
    )
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_write_fspath_all(self, writer_name, writer_kwargs, module):
         if writer_name in ["to_latex"]:  # uses Styler implementation
             pytest.importorskip("jinja2")
@@ -439,6 +443,7 @@ def test_unknown_engine(self):
             with pytest.raises(ValueError, match="Unknown engine"):
                 pd.read_csv(path, engine="pyt")
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_binary_mode(self):
         """
        'encoding' shouldn't be passed to 'open' in binary mode.
@@ -497,6 +502,7 @@ def test_is_fsspec_url():
     assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("encoding", [None, "utf-8"])
 @pytest.mark.parametrize("format", ["csv", "json"])
 def test_codecs_encoding(encoding, format):
@@ -517,6 +523,7 @@ def test_codecs_encoding(encoding, format):
     tm.assert_frame_equal(expected, df)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_codecs_get_writer_reader():
     # GH39247
     expected = pd.DataFrame(
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index efc3e71564260..5eb202dd5aa24 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -12,6 +12,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import is_platform_windows
 
 import pandas as pd
@@ -137,6 +139,7 @@ def test_compression_warning(compression_only):
             df.to_csv(handles.handle, compression=compression_only)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_compression_binary(compression_only):
     """
     Binary file handles support compression.
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index dc82994bcbc7f..c20c5a45a12fa 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.core.arrays import (
@@ -146,6 +148,7 @@ def test_path_pathlib(self):
         result = tm.round_trip_pathlib(df.to_feather, read_feather)
         tm.assert_frame_equal(df, result)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_passthrough_keywords(self):
         df = pd.DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
@@ -164,6 +167,7 @@ def test_http_path(self, feather_file, httpserver):
         res = read_feather(httpserver.url)
         tm.assert_frame_equal(expected, res)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
         # GH#50765
         df = pd.DataFrame(
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index c609ae999d47d..59dd6d8f410df 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     DataFrame,
     date_range,
@@ -202,6 +204,7 @@ def test_arrowparquet_options(fsspectest):
     assert fsspectest.test[0] == "parquet_read"
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_fastparquet_options(fsspectest):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
     pytest.importorskip("fastparquet")
@@ -259,6 +262,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so):
     )
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.single_cpu
 def test_s3_parquet(s3_public_bucket, s3so, df1):
     pytest.importorskip("fastparquet")
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
index 434642ed7fc90..e113fa25b2a3f 100644
--- a/pandas/tests/io/test_gcs.py
+++ b/pandas/tests/io/test_gcs.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat.pyarrow import pa_version_under17p0
 
 from pandas import (
@@ -156,6 +158,7 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str):
         assert result == expected
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"])
 def test_to_csv_compression_encoding_gcs(
     gcs_buffer, compression_only, encoding, compression_to_extension
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index dfc9b4156ecab..164646aedf464 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import is_platform_windows
 import pandas.util._test_decorators as td
 
@@ -36,6 +38,10 @@
 
 from pandas.io.common import file_path_to_url
 
+pytestmark = pytest.mark.xfail(
+    using_string_dtype(), reason="TODO(infer_string)", strict=False
+)
+
 
 @pytest.fixture(
     params=[
diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py
index dfae294a147a2..b11fe931f46e5 100644
--- a/pandas/tests/io/test_http_headers.py
+++ b/pandas/tests/io/test_http_headers.py
@@ -8,6 +8,8 @@
 
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -84,6 +86,7 @@ def stata_responder(df):
     return bio.getvalue()
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "responder, read_method",
     [
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index c7d9300c0a638..a189afbac070d 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import read_orc
 import pandas._testing as tm
@@ -18,9 +20,12 @@
 
 import pyarrow as pa
 
-pytestmark = pytest.mark.filterwarnings(
-    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
-)
+pytestmark = [
+    pytest.mark.filterwarnings(
+        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+    ),
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 @pytest.fixture
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 930df8abea30f..561c718ea5851 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import is_platform_windows
 from pandas.compat.pyarrow import (
     pa_version_under11p0,
@@ -49,6 +51,7 @@
     pytest.mark.filterwarnings(
         "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
     ),
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
 ]
 
 
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 35a3ceb98132d..a21893f66722a 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -18,6 +18,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs import lib
 from pandas.compat import pa_version_under14p1
 from pandas.compat._optional import import_optional_dependency
@@ -58,9 +60,12 @@
 
 import sqlalchemy
 
-pytestmark = pytest.mark.filterwarnings(
-    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
-)
+pytestmark = [
+    pytest.mark.filterwarnings(
+        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+    ),
+    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
+]
 
 
 @pytest.fixture
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index c2c4140fa304d..9f5085ff2ad28 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -11,6 +11,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -433,6 +435,7 @@ def test_write_dta6(self, datapath, temp_file):
             check_index_type=False,
         )
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
     def test_read_write_dta10(self, version, temp_file):
         original = DataFrame(
@@ -1273,6 +1276,7 @@ def test_categorical_ordering(self, file, datapath):
             assert parsed[col].cat.ordered
             assert not parsed_unordered[col].cat.ordered
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.filterwarnings("ignore::UserWarning")
     @pytest.mark.parametrize(
         "file",
@@ -1365,6 +1369,7 @@ def test_iterator(self, datapath):
             from_chunks = pd.concat(itr)
         tm.assert_frame_equal(parsed, from_chunks)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.filterwarnings("ignore::UserWarning")
     @pytest.mark.parametrize(
         "file",
@@ -1669,6 +1674,7 @@ def test_inf(self, infval, temp_file):
         path = temp_file
         df.to_stata(path)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_path_pathlib(self):
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
@@ -1693,6 +1699,7 @@ def test_value_labels_iterator(self, write_index, temp_file):
             value_labels = dta_iter.value_labels()
         assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}}
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_set_index(self, temp_file):
         # GH 17328
         df = DataFrame(
@@ -1726,6 +1733,7 @@ def test_date_parsing_ignores_format_details(self, column, datapath):
         formatted = df.loc[0, column + "_fmt"]
         assert unformatted == formatted
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("byteorder", ["little", "big"])
     def test_writer_117(self, byteorder, temp_file):
         original = DataFrame(
@@ -1837,6 +1845,7 @@ def test_invalid_date_conversion(self, temp_file):
         with pytest.raises(ValueError, match=msg):
             original.to_stata(path, convert_dates={"wrong_name": "tc"})
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
     def test_nonfile_writing(self, version, temp_file):
         # GH 21041
@@ -1855,6 +1864,7 @@ def test_nonfile_writing(self, version, temp_file):
         reread = read_stata(path, index_col="index")
         tm.assert_frame_equal(df, reread)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_gzip_writing(self, temp_file):
         # writing version 117 requires seek and cannot be used with gzip
         df = DataFrame(
@@ -1897,6 +1907,7 @@ def test_unicode_dta_118_119(self, file, datapath):
 
         tm.assert_frame_equal(unicode_df, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_mixed_string_strl(self, temp_file):
         # GH 23633
         output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}]
@@ -1989,6 +2000,7 @@ def test_stata_119(self, datapath):
             reader._ensure_open()
             assert reader._nvar == 32999
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("version", [118, 119, None])
     @pytest.mark.parametrize("byteorder", ["little", "big"])
     def test_utf8_writer(self, version, byteorder, temp_file):
@@ -2336,6 +2348,7 @@ def test_iterator_errors(datapath, chunksize):
             pass
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_iterator_value_labels(temp_file):
     # GH 31544
     values = ["c_label", "b_label"] + ["a_label"] * 500
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 6c9d374935ed5..036a5d6265dd7 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -14,6 +14,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import WASM
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
@@ -1990,6 +1992,7 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so):
     tm.assert_frame_equal(df_lxml, df_etree)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_read_xml_nullable_dtypes(
     parser, string_storage, dtype_backend, using_infer_string
 ):
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index 96ef50f9d7149..409aafee58e49 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -4,6 +4,8 @@
 
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import ParserWarning
 import pandas.util._test_decorators as td
 
@@ -83,6 +85,7 @@ def read_xml_iterparse(data, **kwargs):
 # DTYPE
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_dtype_single_str(parser):
     df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser)
     df_iter = read_xml_iterparse(
@@ -208,6 +211,7 @@ def test_wrong_dtype(xml_books, parser, iterparse):
     )
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_both_dtype_converters(parser):
     df_expected = DataFrame(
         {
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index c781e35e71ca6..63e9e89cabd58 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -1387,6 +1389,7 @@ def test_mode_numerical_nan(self, dropna, expected):
         expected = Series(expected)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize(
         "dropna, expected1, expected2, expected3",
         [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])],
@@ -1414,6 +1417,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
         expected3 = Series(expected3)
         tm.assert_series_equal(result, expected3)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize(
         "dropna, expected1, expected2",
         [(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])],
@@ -1551,6 +1555,7 @@ def test_mode_intoverflow(self, dropna, expected1, expected2):
         expected2 = Series(expected2, dtype=np.uint64)
         tm.assert_series_equal(result, expected2)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_mode_sortwarning(self):
         # Check for the warning that is raised when the mode
         # results cannot be sorted
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index 520ef40153ecd..ff1b82210e20d 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import is_platform_windows
 
 import pandas as pd
@@ -491,6 +493,7 @@ def test_empty(keys):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("consolidate", [True, False])
 def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
     # https://github.com/pandas-dev/pandas/issues/39329
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 550b424371a95..b2caa1fadd1a5 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -10,6 +10,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import InvalidIndexError
 
 import pandas as pd
@@ -45,6 +47,7 @@ def test_append_concat(self):
         assert isinstance(result.index, PeriodIndex)
         assert result.index[0] == s1.index[0]
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_concat_copy(self):
         df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
         df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1))
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index bd364de26a3c4..62fd8c5a7e231 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -3062,6 +3064,7 @@ def test_on_float_by_int(self):
 
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_merge_datatype_error_raises(self, using_infer_string):
         if using_infer_string:
             msg = "incompatible merge keys"
diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py
index ba71bb24e8a16..bfb6a3c0167c8 100644
--- a/pandas/tests/reshape/test_from_dummies.py
+++ b/pandas/tests/reshape/test_from_dummies.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     DataFrame,
     Series,
@@ -362,6 +364,7 @@ def test_with_prefix_contains_get_dummies_NaN_column():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "default_category, expected",
     [
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index 49200face66c5..be4f2ab4d183d 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -81,6 +83,7 @@ def test_default_col_names(self, df):
         result2 = df.melt(id_vars=["id1", "id2"])
         assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_value_vars(self, df):
         result3 = df.melt(id_vars=["id1", "id2"], value_vars="A")
         assert len(result3) == 10
@@ -97,6 +100,7 @@ def test_value_vars(self, df):
         )
         tm.assert_frame_equal(result4, expected4)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("type_", (tuple, list, np.array))
     def test_value_vars_types(self, type_, df):
         # GH 15348
@@ -174,6 +178,7 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1):
         with pytest.raises(ValueError, match=msg):
             df1.melt(id_vars=id_vars, value_vars=value_vars)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_custom_var_name(self, df, var_name):
         result5 = df.melt(var_name=var_name)
         assert result5.columns.tolist() == ["var", "value"]
@@ -201,6 +206,7 @@ def test_custom_var_name(self, df, var_name):
         )
         tm.assert_frame_equal(result9, expected9)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_custom_value_name(self, df, value_name):
         result10 = df.melt(value_name=value_name)
         assert result10.columns.tolist() == ["variable", "val"]
@@ -230,6 +236,7 @@ def test_custom_value_name(self, df, value_name):
         )
         tm.assert_frame_equal(result14, expected14)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_custom_var_and_value_name(self, df, value_name, var_name):
         result15 = df.melt(var_name=var_name, value_name=value_name)
         assert result15.columns.tolist() == ["var", "val"]
@@ -354,6 +361,7 @@ def test_melt_missing_columns_raises(self):
         with pytest.raises(KeyError, match=msg):
             df.melt(["A"], ["F"], col_level=0)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_melt_mixed_int_str_id_vars(self):
         # GH 29718
         df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]})
@@ -1214,6 +1222,7 @@ def test_raise_of_column_name_value(self):
         ):
             df.melt(id_vars="value", value_name="value")
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("dtype", ["O", "string"])
     def test_missing_stubname(self, dtype):
         # GH46044
@@ -1239,6 +1248,7 @@ def test_missing_stubname(self, dtype):
         tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_wide_to_long_pyarrow_string_columns():
     # GH 57066
     pytest.importorskip("pyarrow")
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 476ec2fc76488..44b96afaa4ef5 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1068,6 +1068,7 @@ def test_margins_dtype_len(self, data):
 
         tm.assert_frame_equal(expected, result)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)])
     def test_pivot_table_multiindex_only(self, cols):
         # GH 17038
@@ -2569,6 +2570,7 @@ def test_pivot_empty(self):
         expected = DataFrame(index=[], columns=[])
         tm.assert_frame_equal(result, expected, check_names=False)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("dtype", [object, "string"])
     def test_pivot_integer_bug(self, dtype):
         df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype)
diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py
index 8d78d34e936f0..1d5d16f39e648 100644
--- a/pandas/tests/reshape/test_union_categoricals.py
+++ b/pandas/tests/reshape/test_union_categoricals.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.core.dtypes.concat import union_categoricals
 
 import pandas as pd
@@ -122,6 +124,7 @@ def test_union_categoricals_nan(self):
         exp = Categorical([np.nan, np.nan, np.nan, np.nan])
         tm.assert_categorical_equal(res, exp)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("val", [[], ["1"]])
     def test_union_categoricals_empty(self, val, request, using_infer_string):
         # GH 13759
diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py
index 49ae0a60e6608..03e823ce607fb 100644
--- a/pandas/tests/series/accessors/test_dt_accessor.py
+++ b/pandas/tests/series/accessors/test_dt_accessor.py
@@ -11,6 +11,8 @@
 import pytest
 import pytz
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs.timezones import maybe_get_tz
 
 from pandas.core.dtypes.common import (
@@ -512,6 +514,7 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale):
             ser = pd.concat([ser, Series([pd.NaT])])
             assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1])
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_strftime(self):
         # GH 10086
         ser = Series(date_range("20130101", periods=5))
@@ -554,6 +557,7 @@ def test_strftime(self):
         )
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_strftime_dt64_days(self):
         ser = Series(date_range("20130101", periods=5))
         ser.iloc[0] = pd.NaT
@@ -584,6 +588,7 @@ def test_strftime_period_days(self, using_infer_string):
             expected = expected.astype("string[pyarrow_numpy]")
         tm.assert_index_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_strftime_dt64_microsecond_resolution(self):
         ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)])
         result = ser.dt.strftime("%Y-%m-%d %H:%M:%S")
@@ -616,6 +621,7 @@ def test_strftime_period_minutes(self):
         )
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "data",
         [
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
index 228e5cb509982..9f310d8c8ab5f 100644
--- a/pandas/tests/series/indexing/test_indexing.py
+++ b/pandas/tests/series/indexing/test_indexing.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import IndexingError
 
 from pandas import (
@@ -249,6 +251,7 @@ def test_slice(string_series, object_series):
     tm.assert_series_equal(string_series, original)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_timedelta_assignment():
     # GH 8209
     s = Series([], dtype=object)
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
index 62f2c93ef691a..3fcf664c3f01b 100644
--- a/pandas/tests/series/indexing/test_setitem.py
+++ b/pandas/tests/series/indexing/test_setitem.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat.numpy import np_version_gte1p24
 from pandas.errors import IndexingError
 
@@ -528,6 +530,7 @@ def test_append_timedelta_does_not_cast(self, td, using_infer_string, request):
         tm.assert_series_equal(ser, expected)
         assert isinstance(ser["td"], Timedelta)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_setitem_with_expansion_type_promotion(self):
         # GH#12599
         ser = Series(dtype=object)
@@ -537,6 +540,7 @@ def test_setitem_with_expansion_type_promotion(self):
         expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"])
         tm.assert_series_equal(ser, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_setitem_not_contained(self, string_series):
         # set item that's not contained
         ser = string_series.copy()
@@ -845,6 +849,7 @@ def test_series_where(self, obj, key, expected, raises, val, is_inplace):
 
         self._check_inplace(is_inplace, orig, arr, obj)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_index_where(self, obj, key, expected, raises, val, using_infer_string):
         mask = np.zeros(obj.shape, dtype=bool)
         mask[key] = True
@@ -857,6 +862,7 @@ def test_index_where(self, obj, key, expected, raises, val, using_infer_string):
             expected_idx = Index(expected, dtype=expected.dtype)
             tm.assert_index_equal(res, expected_idx)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_index_putmask(self, obj, key, expected, raises, val, using_infer_string):
         mask = np.zeros(obj.shape, dtype=bool)
         mask[key] = True
diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py
index bd1bc1781958c..097976b0a7ac0 100644
--- a/pandas/tests/series/methods/test_info.py
+++ b/pandas/tests/series/methods/test_info.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.compat import PYPY
 
 from pandas import (
@@ -140,6 +142,7 @@ def test_info_memory_usage_deep_pypy():
     assert s_object.memory_usage(deep=True) == s_object.memory_usage()
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "index, plus",
     [
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index 97151784eb94c..de0855bf7192e 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -647,6 +647,7 @@ def test_replace_value_none_dtype_numeric(self, val):
         expected = pd.Series([1, None], dtype=object)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_replace_change_dtype_series(self, using_infer_string):
         # GH#25797
         df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]})
diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py
index 488d0cb9fe9da..0bcad49847291 100644
--- a/pandas/tests/series/methods/test_to_csv.py
+++ b/pandas/tests/series/methods/test_to_csv.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import Series
 import pandas._testing as tm
@@ -24,6 +26,7 @@
 
         return out
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_from_csv(self, datetime_series, string_series, temp_file):
         # freq doesn't round-trip
         datetime_series.index = datetime_series.index._with_freq(None)
diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py
index f9e6dc644e908..8c4f0ff3eaea7 100644
--- a/pandas/tests/series/methods/test_unstack.py
+++ b/pandas/tests/series/methods/test_unstack.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -134,6 +136,7 @@ def test_unstack_mixed_type_name_in_multiindex(
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_unstack_multi_index_categorical_values():
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index f0930a831e98d..ff84b5c52183b 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs import lib
 from pandas._libs.tslibs import IncompatibleFrequency
 
@@ -500,6 +502,7 @@ def test_ser_cmp_result_names(self, names, comparison_op):
             result = op(ser, cidx)
         assert result.name == names[2]
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_comparisons(self, using_infer_string):
         s = Series(["a", "b", "c"])
         s2 = Series([False, True, False])
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
index f59eacea3fe6c..939bf888fd61b 100644
--- a/pandas/tests/series/test_logical_ops.py
+++ b/pandas/tests/series/test_logical_ops.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas import (
     DataFrame,
     Index,
@@ -348,6 +350,7 @@ def test_reverse_ops_with_index(self, op, expected):
         expected = Series(expected)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_logical_ops_label_based(self, using_infer_string):
         # GH#4947
         # logical ops should be label based
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index cdcd36846c560..06fd81ed722d9 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs import (
     algos as libalgos,
     hashtable as ht,
@@ -1682,6 +1684,7 @@ def test_unique_complex_numbers(self, array, expected):
 
 
 class TestHashTable:
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "htable, data",
         [
@@ -1721,6 +1724,7 @@ def test_hashtable_unique(self, htable, data, writable):
         reconstr = result_unique[result_inverse]
         tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "htable, data",
        [

From dd2dbcd016762a7d1050fb5d0b746ceff3ed0770 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 30 Jul 2024 19:07:20 +0200
Subject: [PATCH 245/272] TST (string dtype): follow-up on GH-59329 fixing new xfails (#59352)

* TST (string dtype): follow-up on GH-59329 fixing new xfails

* add missing strict
---
 pandas/_testing/asserters.py                       | 10 ++++++++--
 .../tests/arrays/interval/test_interval_pyarrow.py |  3 +++
 pandas/tests/arrays/masked/test_arrow_compat.py    | 12 +++++++++---
 pandas/tests/arrays/masked/test_function.py        |  3 ---
 pandas/tests/arrays/period/test_arrow_compat.py    |  4 ++++
 pandas/tests/arrays/string_/test_string.py         |  4 ++++
 pandas/tests/arrays/test_array.py                  |  3 +++
 pandas/tests/dtypes/test_common.py                 |  3 +++
 pandas/tests/frame/methods/test_astype.py          |  3 +++
 pandas/tests/frame/test_arithmetic.py              |  1 +
 pandas/tests/groupby/test_apply.py                 |  3 +++
 pandas/tests/indexes/base_class/test_formats.py    |  1 +
 pandas/tests/indexes/multi/test_setops.py          |  3 +++
pandas/tests/indexes/test_old_base.py | 1 + pandas/tests/io/excel/test_readers.py | 4 +++- pandas/tests/io/json/test_json_table_schema.py | 6 ++++++ pandas/tests/io/json/test_pandas.py | 2 ++ pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 +- pandas/tests/io/parser/test_upcast.py | 2 +- pandas/tests/io/parser/usecols/test_usecols_basic.py | 3 +++ pandas/tests/io/test_feather.py | 11 ++++++----- pandas/tests/io/test_fsspec.py | 1 + pandas/tests/reshape/test_get_dummies.py | 3 +++ pandas/tests/series/methods/test_convert_dtypes.py | 3 +++ pandas/tests/test_downstream.py | 3 +++ 25 files changed, 78 insertions(+), 16 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 1127a4512643c..d52dabe47279a 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -578,13 +578,19 @@ def raise_assert_detail( if isinstance(left, np.ndarray): left = pprint_thing(left) - elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(left, (CategoricalDtype, NumpyEADtype)): left = repr(left) + elif isinstance(left, StringDtype): + # TODO(infer_string) this special case could be avoided if we have + # a more informative repr https://github.com/pandas-dev/pandas/issues/59342 + left = f"StringDtype(storage={left.storage}, na_value={left.na_value})" if isinstance(right, np.ndarray): right = pprint_thing(right) - elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(right, (CategoricalDtype, NumpyEADtype)): right = repr(right) + elif isinstance(right, StringDtype): + right = f"StringDtype(storage={right.storage}, na_value={right.na_value})" msg += f""" [left]: {left} diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index ef8701be81e2b..be87d5d3ef7ba 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -80,6 +82,7 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 5f73370554473..c719e19a7c8d1 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,12 +1,18 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] + pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 6b352758b3ae6..b4b1761217826 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from 
pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -60,7 +58,6 @@ def test_tolist(data): tm.assert_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_numpy(): # GH#56991 diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 431309aca0df2..ff86b696c8403 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,5 +1,7 @@ import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -77,6 +79,7 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_table_roundtrip(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -96,6 +99,7 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_load_from_zero_chunks(): # GH-41040 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 7757847f3c841..3fde3cbca8d8c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under12p0 from pandas.core.dtypes.common import is_dtype_equal @@ -511,6 +513,7 @@ def test_arrow_array(dtype): assert arr.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -539,6 +542,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert result.loc[2, "a"] is result["a"].dtype.na_value +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks( dtype, string_storage2, request, using_infer_string diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index f7b76e7388ae9..76b8928f28b65 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import register_extension_dtype @@ -285,6 +287,7 @@ def test_array_copy(): assert tm.shares_memory(a, b) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index f47815ee059af..a6b549d24c66d 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -128,6 +130,7 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x)) def test_pyarrow_string_import_error(name, dtype): # GH-44276 diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index edc90ce77ad3a..0b525c8d9e1de 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -742,6 +744,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, using_infer_string ): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 11e51056d51d0..734bfc8b30053 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2097,6 +2097,7 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mixed_col_index_dtype(): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 75801b9e039f6..644f93a37a3a3 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -920,6 +922,7 @@ def test_func_returns_object(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 260b4203a4f04..dc4763d96bc71 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -9,6 +9,7 @@ class TestIndexRendering: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather than # stylized diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 47f21cc7f8182..e85091aaae608 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( CategoricalIndex, @@ -752,6 +754,7 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): tm.assert_index_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_union_with_na_when_constructing_dataframe(): # GH43222 series1 = Series( diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 4b10dba4afc72..6d01ba6adc87a 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -228,6 +228,7 @@ def test_logical_compat(self, 
simple_index): with pytest.raises(TypeError, match=msg): idx.any() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f7d01cc403d6c..65a52bc8e0794 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -692,7 +692,9 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail(using_string_dtype(), reason="infer_string takes precedence") + @pytest.mark.xfail( + using_string_dtype(), reason="infer_string takes precedence", strict=False + ) def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index e61a8ee722443..bddd71d2bd5f6 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -25,6 +27,10 @@ set_default_names, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture def df_schema(): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5867502f9cffb..d281729e9704c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1560,6 +1560,7 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) @@ -1572,6 +1573,7 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index a5c57a81d8069..a27df95f7eb2a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -461,7 +461,7 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index d8c40670afcbd..01e576ba40f26 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -86,7 +86,7 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 82b42beb38ae0..d02364a77df90 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserError from pandas import ( @@ -529,6 +531,7 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """ diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index c20c5a45a12fa..5aa8f1c69fe44 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -16,9 +16,12 @@ from pandas.io.feather_format import read_feather, to_feather # isort:skip -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] pa = pytest.importorskip("pyarrow") @@ -148,7 +151,6 @@ def test_path_pathlib(self): result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -167,7 +169,6 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_feather_dtype_backend(self, string_storage, dtype_backend): # GH#50765 df = pd.DataFrame( diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 59dd6d8f410df..7ffee9ea78ddc 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -283,6 +283,7 @@ def test_not_present_exception(): read_csv("memory://test/test.csv") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_feather_options(fsspectest): pytest.importorskip("pyarrow") df = DataFrame({"a": [0]}) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 082d5f0ee81ab..304ba65f38d3c 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -214,6 +216,7 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_dummies_string_dtype(self, df, using_infer_string): # GH44965 df = df[["A", "B"]] diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 7c96a5b0f00d1..4a8af259b4134 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ 
b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib import pandas as pd @@ -10,6 +12,7 @@ class TestSeriesConvertDtypes: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, maindtype, expected_default, expected_other", [ diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index ee26fdae74960..1e6538ca5a8fb 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -10,6 +10,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IntCastingNaNError import pandas as pd @@ -164,6 +166,7 @@ def test_pandas_datareader(): pytest.importorskip("pandas_datareader") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pyarrow(df): pyarrow = pytest.importorskip("pyarrow") From 7e7735b6dd14eb106998665cdbee0782ef46eab3 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 30 Jul 2024 22:38:20 +0530 Subject: [PATCH 246/272] DOC: fix PR07,SA01 for pandas.MultiIndex.append (#59354) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2c713a8e7bbea..af1d93d1f153b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ee24e485a9331..c278927c1db6e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2223,15 +2223,28 @@ def append(self, other): """ Append a collection of Index options together. + The `append` method is used to combine multiple `Index` objects into a single + `Index`. This is particularly useful when dealing with multi-level indexing + (MultiIndex) where you might need to concatenate different levels of indices. + The method handles the alignment of the levels and codes of the indices being + appended to ensure consistency in the resulting `MultiIndex`. + Parameters ---------- other : Index or list/tuple of indices + Index or list/tuple of Index objects to be appended. Returns ------- Index The combined index. + See Also + -------- + MultiIndex: A multi-level, or hierarchical, index object for pandas objects. + Index.append : Append a collection of Index options together. + concat : Concatenate pandas objects along a particular axis. 
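The new summary's claim that ``append`` realigns levels and codes can be made concrete with a short sketch (an illustration of the documented semantics, not lines added by this patch):

    >>> import pandas as pd
    >>> left = pd.MultiIndex.from_arrays([["a", "a"], [1, 2]])
    >>> right = pd.MultiIndex.from_arrays([["b"], [3]])
    >>> left.append(right)  # levels differ, yet the result stays consistent
    MultiIndex([('a', 1),
                ('a', 2),
                ('b', 3)],
               )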
+
       Examples
       --------
       >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"]])
From 12c8ec42d936f2cda45a970faac261301ac0db47 Mon Sep 17 00:00:00 2001
From: Jay Ahn
Date: Tue, 30 Jul 2024 13:18:44 -0400
Subject: [PATCH 247/272] Add doc for counting categorical dtype (#59327)

* Add doc for counting categorical dtype
* Move example to docstring instead
* Remove backslash
* add line
* Undo the change in categorical.rst
* Rename it to Categorical Dtypes
* undo categorical.rst
---
 pandas/core/base.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/pandas/core/base.py b/pandas/core/base.py
index b784dc8b03292..863cf978426e2 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -1049,6 +1049,34 @@ def value_counts(
         4.0    1
         NaN    1
         Name: count, dtype: int64
+
+        **Categorical Dtypes**
+
+        Rows with categorical type will be counted as one group
+        if they have the same categories and order.
+        In the example below, even though ``a``, ``c``, and ``d``
+        all have the same data type of ``category``,
+        only ``c`` and ``d`` will be counted as one group
+        since ``a`` doesn't have the same categories.
+
+        >>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]})
+        >>> df = df.astype({"a": "category", "c": "category", "d": "category"})
+        >>> df
+           a  b  c  d
+        0  1  2  3  3
+
+        >>> df.dtypes
+        a    category
+        b      object
+        c    category
+        d    category
+        dtype: object
+
+        >>> df.dtypes.value_counts()
+        category    2
+        category    1
+        object      1
+        Name: count, dtype: int64
         """
         return algorithms.value_counts_internal(
             self,
From 7acd629fea2a32d1ace93ceab2b62d5f5f9b2d47 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 30 Jul 2024 07:20:07 -1000
Subject: [PATCH 248/272] BUG: Avoid RangeIndex conversion in read_csv if dtype is specified (#59316)

* BUG: Avoid RangeIndex conversion in read_csv if dtype is specified

* Undo change

* Typing
---
 pandas/io/parsers/base_parser.py              | 39 +++++++++++++------
 pandas/io/parsers/c_parser_wrapper.py         |  2 +-
 pandas/io/parsers/python_parser.py            |  4 +-
 .../io/parser/dtypes/test_dtypes_basic.py     | 18 ++++++++-
 4 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 719afe160614f..7294efe843cce 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -4,6 +4,7 @@
 from copy import copy
 import csv
 from enum import Enum
+import itertools
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -271,7 +272,7 @@ def _maybe_make_multi_index_columns(

     @final
     def _make_index(
-        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
+        self, alldata, columns, indexnamerow: list[Scalar] | None = None
     ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
         index: Index | None
         if isinstance(self.index_col, list) and len(self.index_col):
@@ -326,7 +327,11 @@ def _agg_index(self, index) -> Index:
         converters = self._clean_mapping(self.converters)
         clean_dtypes = self._clean_mapping(self.dtype)

-        for i, arr in enumerate(index):
+        if self.index_names is not None:
+            names: Iterable = self.index_names
+        else:
+            names = itertools.cycle([None])
+        for i, (arr, name) in enumerate(zip(index, names)):
             if self._should_parse_dates(i):
                 arr = date_converter(
                     arr,
@@ -369,12 +374,17 @@ def _agg_index(self, index) -> Index:
             arr, _ = self._infer_types(
                 arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
             )
-            arrays.append(arr)
-
-        names = self.index_names
-        index = ensure_index_from_sequences(arrays, names)
+            if cast_type is not 
None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) + else: + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - return index + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _set_noconvert_dtype_columns( @@ -704,12 +714,11 @@ def _get_empty_meta( dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -726,8 +735,14 @@ def _get_empty_meta( if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index f4198ac2a1443..818c9f5ff6b80 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -338,7 +338,7 @@ def read( data = {k: v for k, (i, v) in zip(names, data_tups)} date_data = self._do_date_conversions(names, data) - index, column_names = self._make_index(date_data, alldata, names) + index, column_names = self._make_index(alldata, names) return index, column_names, date_data diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index c445529a6db48..3a2a1c37f1879 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -312,9 +312,7 @@ def read( conv_data = self._convert_data(data) conv_data = self._do_date_conversions(columns, conv_data) - index, result_columns = self._make_index( - conv_data, alldata, columns, indexnamerow - ) + index, result_columns = self._make_index(alldata, columns, indexnamerow) return index, result_columns, conv_data diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index a27df95f7eb2a..3f410a13c8f80 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -29,6 +29,8 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @@ -614,6 +616,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL @@ -624,7 +627,7 @@ def test_accurate_parsing_of_large_integers(all_parsers): AMZN,20230301181139587,2023552585717889759,2023552585717263360 MSFT,20230301181139587,2023552585717889863,2023552585717263361 NVDA,20230301181139587,2023552585717889827,2023552585717263361""" - orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + orders = all_parsers.read_csv(StringIO(data), 
dtype={"ID_DEAL": pd.Int64Dtype()}) assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 @@ -646,3 +649,16 @@ def test_dtypes_with_usecols(all_parsers): values = ["1", "4"] expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) tm.assert_frame_equal(result, expected) + + +def test_index_col_with_dtype_no_rangeindex(all_parsers): + data = StringIO("345.5,519.5,0\n519.5,726.5,1") + result = all_parsers.read_csv( + data, + header=None, + names=["start", "stop", "bin_id"], + dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32}, + index_col="bin_id", + ).index + expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id") + tm.assert_index_equal(result, expected) From 88a566886292a473072e6ae3bce9b16a303d7d71 Mon Sep 17 00:00:00 2001 From: William Andrea <22385371+wjandrea@users.noreply.github.com> Date: Tue, 30 Jul 2024 21:44:52 -0400 Subject: [PATCH 249/272] DOC: Typo in style.ipynb (#59361) Typo in style.ipynb Semicolon instead of colon --- doc/source/user_guide/style.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 04ba3e5be8ff7..f4a55280cd1f1 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -351,7 +351,7 @@ "\n", "- Using [.set_table_styles()][table] to control broader areas of the table with specified internal CSS. Although table styles allow the flexibility to add CSS selectors and properties controlling all individual parts of the table, they are unwieldy for individual cell specifications. Also, note that table styles cannot be exported to Excel. \n", "- Using [.set_td_classes()][td_class] to directly link either external CSS classes to your data cells or link the internal CSS classes created by [.set_table_styles()][table]. See [here](#Setting-Classes-and-Linking-to-External-CSS). These cannot be used on column header rows or indexes, and also won't export to Excel. \n", - "- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes; [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n", + "- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes: [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. 
These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n",
    "[table]: ../reference/api/pandas.io.formats.style.Styler.set_table_styles.rst\n",
    "[styler]: ../reference/api/pandas.io.formats.style.Styler.rst\n",
From 73b5578967bedd1f94b8a54d9047f33364178783 Mon Sep 17 00:00:00 2001
From: Apoorv <113182336+ApoorvApoorv@users.noreply.github.com>
Date: Wed, 31 Jul 2024 02:46:47 +0100
Subject: [PATCH 250/272] DOC: Added extra sentences to clarify series.GroupBy snippets in examples (#59331)

* Added messages for each relevant snippet
* some small corrections to clarify further
* removed trailing whitespace
* more formatting correction
* more cleanup
* reverting changes
* trying to format documentation correctly
* removed some part of added text
* testing if removing list works
* reverting some changes
* reverting changes
* checking if minor changes also lead to failures
* reverting all changes to pass the tests
* checking if small changes cause errors as well
* pushing the changes back
---
 pandas/core/series.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index f340821775015..a197886748bce 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1815,14 +1815,30 @@ def _set_name(
         Parrot      30.0
         Parrot      20.0
         Name: Max Speed, dtype: float64
+
+        We can pass a list of values to group the Series data by custom labels:
+
         >>> ser.groupby(["a", "b", "a", "b"]).mean()
         a    210.0
         b    185.0
         Name: Max Speed, dtype: float64
+
+        Grouping by numeric labels yields similar results:
+
+        >>> ser.groupby([0, 1, 0, 1]).mean()
+        0    210.0
+        1    185.0
+        Name: Max Speed, dtype: float64
+
+        We can group by a level of the index:
+
         >>> ser.groupby(level=0).mean()
         Falcon    370.0
         Parrot     25.0
         Name: Max Speed, dtype: float64
+
+        We can group by a condition applied to the Series values:
+
         >>> ser.groupby(ser > 100).mean()
         Max Speed
         False     25.0
@@ -1845,11 +1861,16 @@ def _set_name(
         Parrot Captive     30.0
                Wild        20.0
         Name: Max Speed, dtype: float64
+
         >>> ser.groupby(level=0).mean()
         Animal
         Falcon    370.0
         Parrot     25.0
         Name: Max Speed, dtype: float64
+
+        We can also group by the 'Type' level of the hierarchical index
+        to get the mean speed for each type:
+
         >>> ser.groupby(level="Type").mean()
         Type
         Captive    210.0
@@ -1865,12 +1886,17 @@ def _set_name(
         b    3
         dtype: int64
 
+        To include `NA` values in the group keys, set `dropna=False`:
+
         >>> ser.groupby(level=0, dropna=False).sum()
         a    3
         b    3
         NaN  3
         dtype: int64
 
+        We can also group by a custom list with NaN values to handle
+        missing group labels:
+
         >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
         >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
         >>> ser.groupby(["a", "b", "a", np.nan]).mean()
From 70c7acab1e341cfe855d2f42b81b1f692f184164 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Wed, 31 Jul 2024 22:36:57 +0530
Subject: [PATCH 251/272] DOC: fix PR07,RT03,SA01 for pandas.MultiIndex.copy (#59363)

---
 ci/code_checks.sh            |  1 -
 pandas/core/indexes/multi.py | 21 +++++++++++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index af1d93d1f153b..4d3ebe393c262 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         --format=actions \
         -i ES01 `# For now it is ok if docstrings are missing the extended summary` \
         -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters 
section` \ - -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c278927c1db6e..ee271cc7be86c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1282,20 +1282,37 @@ def copy( # type: ignore[override] name=None, ) -> Self: """ - Make a copy of this object. + Make a copy of this object. Names, dtype, levels and codes can be passed and \ + will be set on new copy. - Names, dtype, levels and codes can be passed and will be set on new copy. + The `copy` method provides a mechanism to create a duplicate of an + existing MultiIndex object. This is particularly useful in scenarios where + modifications are required on an index, but the original MultiIndex should + remain unchanged. By specifying the `deep` parameter, users can control + whether the copy should be a deep or shallow copy, providing flexibility + depending on the size and complexity of the MultiIndex. Parameters ---------- names : sequence, optional + Names to set on the new MultiIndex object. deep : bool, default False + If False, the new object will be a shallow copy. If True, a deep copy + will be attempted. Deep copying can be potentially expensive for large + MultiIndex objects. name : Label Kept for compatibility with 1-dimensional Index. Should not be used. Returns ------- MultiIndex + A new MultiIndex object with the specified modifications. + + See Also + -------- + MultiIndex.from_arrays : Convert arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Convert DataFrame to MultiIndex. Notes ----- From 8456ead03f87c3f10cb222851206db82d4b1fa10 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 31 Jul 2024 22:37:37 +0530 Subject: [PATCH 252/272] DOC: fix ES01,PR07 for pandas.MultiIndex.get_loc (#59364) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 11 ++++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4d3ebe393c262..c6c005913134b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -71,7 +71,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.MultiIndex.get_level_values SA01" \ - -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ -i "pandas.MultiIndex.levshape SA01" \ -i "pandas.MultiIndex.names SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ee271cc7be86c..5cb9ec2d9d9f1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2985,14 +2985,19 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: def get_loc(self, key): """ - Get location for a label or a tuple of labels. + Get location for a label or a tuple of labels. The location is returned \ + as an integer/slice or boolean mask. - The location is returned as an integer/slice or boolean - mask. + This method returns the integer location, slice object, or boolean mask + corresponding to the specified key, which can be a single label or a tuple + of labels. The key represents a position in the MultiIndex, and the location + indicates where the key is found within the index. 
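Before the parameter listing, a quick sketch of the return shapes the paragraph above describes (based on the documented behavior of ``MultiIndex.get_loc``, not on lines added in this hunk):

    >>> import pandas as pd
    >>> mi = pd.MultiIndex.from_arrays([list("abb"), list("def")])
    >>> mi.get_loc("b")  # partial key on a monotonic index -> slice
    slice(1, 3, None)
    >>> mi.get_loc(("b", "e"))  # complete key -> integer position
    1
    >>> mi2 = pd.MultiIndex.from_arrays([list("aba"), list("def")])
    >>> mi2.get_loc("a")  # non-contiguous matches -> boolean mask
    array([ True, False,  True])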
Parameters ---------- key : label or tuple of labels (one for each level) + A label or tuple of labels that correspond to the levels of the MultiIndex. + The key must match the structure of the MultiIndex. Returns ------- From 7416a59391e5872168ad82f76ad380d371f2f815 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 31 Jul 2024 22:38:43 +0530 Subject: [PATCH 253/272] DOC: fix ES01,SA01 for pandas.MultiIndex.levshape (#59365) DOC: fix SA01 for pandas.MultiIndex.levshape --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 14 +++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c6c005913134b..5ef1ec7af3465 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -72,7 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc_level PR07" \ - -i "pandas.MultiIndex.levshape SA01" \ -i "pandas.MultiIndex.names SA01" \ -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5cb9ec2d9d9f1..c46e3dce60d9f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1051,7 +1051,19 @@ def nlevels(self) -> int: @property def levshape(self) -> Shape: """ - A tuple with the length of each level. + A tuple representing the length of each level in the MultiIndex. + + In a `MultiIndex`, each level can contain multiple unique values. The + `levshape` property provides a quick way to assess the size of each + level by returning a tuple where each entry represents the number of + unique values in that specific level. This is particularly useful in + scenarios where you need to understand the structure and distribution + of your index levels, such as when working with multidimensional data. + + See Also + -------- + MultiIndex.shape : Return a tuple of the shape of the MultiIndex. + MultiIndex.levels : Returns the levels of the MultiIndex. Examples -------- From 6fdafda3d01f716dde0dce4ba4d3144ef22ff218 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 31 Jul 2024 22:39:16 +0530 Subject: [PATCH 254/272] DOC: fix ES01,RT03,SA01 for pandas.MultiIndex.set_levels (#59367) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5ef1ec7af3465..f6ff9296a9862 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -75,7 +75,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.names SA01" \ -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ - -i "pandas.MultiIndex.set_levels RT03,SA01" \ -i "pandas.MultiIndex.sortlevel PR07,SA01" \ -i "pandas.MultiIndex.to_frame RT03" \ -i "pandas.NA SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c46e3dce60d9f..4863a75d68911 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -933,6 +933,19 @@ def set_levels( """ Set new levels on MultiIndex. Defaults to returning new index. + The `set_levels` method provides a flexible way to change the levels of a + `MultiIndex`. 
This is particularly useful when you need to update the + index structure of your DataFrame without altering the data. The method + returns a new `MultiIndex` unless the operation is performed in-place, + ensuring that the original index remains unchanged unless explicitly + modified. + + The method checks the integrity of the new levels against the existing + codes by default, but this can be disabled if you are confident that + your levels are consistent with the underlying data. This can be useful + when you want to perform optimizations or make specific adjustments to + the index levels that do not strictly adhere to the original structure. + Parameters ---------- levels : sequence or list of sequence @@ -945,6 +958,14 @@ def set_levels( Returns ------- MultiIndex + A new `MultiIndex` with the updated levels. + + See Also + -------- + MultiIndex.set_codes : Set new codes on the existing `MultiIndex`. + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + Index.set_names : Set Index or MultiIndex name. Examples -------- From 4c39c0870732df810351ee51a722d7bf3290cca8 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 31 Jul 2024 22:39:52 +0530 Subject: [PATCH 255/272] DOC: fix ES01,RT03,SA01 for pandas.MultiIndex.remove_unused_levels (#59366) --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f6ff9296a9862..ebcc99100be70 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc_level PR07" \ -i "pandas.MultiIndex.names SA01" \ - -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ -i "pandas.MultiIndex.sortlevel PR07,SA01" \ -i "pandas.MultiIndex.to_frame RT03" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4863a75d68911..799b6f32c1387 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2091,9 +2091,22 @@ def remove_unused_levels(self) -> MultiIndex: appearance, meaning the same .values and ordering. It will also be .equals() to the original. + The `remove_unused_levels` method is useful in cases where you have a + MultiIndex with hierarchical levels, but some of these levels are no + longer needed due to filtering or subsetting operations. By removing + the unused levels, the resulting MultiIndex becomes more compact and + efficient, which can improve performance in subsequent operations. + Returns ------- MultiIndex + A new MultiIndex with unused levels removed. + + See Also + -------- + MultiIndex.droplevel : Remove specified levels from a MultiIndex. + MultiIndex.reorder_levels : Rearrange levels of a MultiIndex. + MultiIndex.set_levels : Set new levels on a MultiIndex. 
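The compaction described in the new summary is easiest to see on a sliced index; a brief sketch, mirroring the method's documented behavior rather than this hunk:

    >>> import pandas as pd
    >>> mi = pd.MultiIndex.from_product([range(2), list("ab")])
    >>> mi[2:].levels  # the slice no longer uses the value 0 in level 0
    FrozenList([[0, 1], ['a', 'b']])
    >>> mi[2:].remove_unused_levels().levels  # the compacted copy drops it
    FrozenList([[1], ['a', 'b']])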
Examples -------- From 89c8d7a0f45590888f8a2ac1889333c0f468694b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 19:24:33 +0200 Subject: [PATCH 256/272] TST (string dtype): change any_string_dtype fixture to use actual dtype instances (#59345) * TST (string dtype): change any_string_dtype fixture to use actual dtype instances * avoid pyarrow import error during test collection * fix dtype equality in case pyarrow is not installed * keep using mode.string_storage as default for NA variant + more xfails * fix test_series_string_inference_storage_definition * remove no longer necessary xfails --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/conftest.py | 29 +++++--- pandas/core/arrays/string_.py | 6 +- .../arrays/categorical/test_constructors.py | 1 - pandas/tests/copy_view/test_array.py | 3 - pandas/tests/copy_view/test_astype.py | 2 - pandas/tests/dtypes/test_common.py | 3 - pandas/tests/io/parser/test_index_col.py | 3 + pandas/tests/series/test_constructors.py | 2 +- pandas/tests/strings/__init__.py | 10 ++- pandas/tests/strings/test_find_replace.py | 70 ++++++++++++++----- pandas/tests/strings/test_split_partition.py | 4 +- pandas/tests/strings/test_strings.py | 32 ++++++--- 12 files changed, 115 insertions(+), 50 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index c36789d2950bc..1d8334a7fe32c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1354,20 +1354,33 @@ def object_dtype(request): @pytest.fixture( params=[ - "object", - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ] + np.dtype("object"), + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ], + ids=[ + "string=object", + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + ], ) def any_string_dtype(request): """ Parametrized fixture for string dtypes. 
* 'object' - * 'string[python]' - * 'string[pyarrow]' + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) """ - return request.param + if isinstance(request.param, np.dtype): + return request.param + else: + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) @pytest.fixture(params=tm.DATETIME64_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index cae770d85637c..3c0cc3a8a9c70 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -129,7 +129,7 @@ def __init__( ) -> None: # infer defaults if storage is None: - if using_string_dtype(): + if using_string_dtype() and na_value is not libmissing.NA: storage = "pyarrow" else: storage = get_option("mode.string_storage") @@ -167,7 +167,9 @@ def __eq__(self, other: object) -> bool: return True try: other = self.construct_from_string(other) - except TypeError: + except (TypeError, ImportError): + # TypeError if `other` is not a valid string for StringDtype + # ImportError if pyarrow is not installed for "string[pyarrow]" return False if isinstance(other, type(self)): return self.storage == other.storage and self.na_value is other.na_value diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index e0bd8386b2c41..6752a503016f8 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -735,7 +735,6 @@ def test_interval(self): tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: arr = pd.arrays.StringArray._from_sequence( diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index bcc8a212fbb98..bb238d08bd9bd 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -119,7 +117,6 @@ def test_dataframe_array_ea_dtypes(): assert arr.flags.writeable is False -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_array_string_dtype(): df = DataFrame({"a": ["a", "b"]}, dtype="string") arr = np.asarray(df) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index a503841386fbc..8724f62de1534 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -84,7 +84,6 @@ def test_astype_numpy_to_ea(): assert np.shares_memory(get_array(ser), get_array(result)) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) @@ -98,7 +97,6 @@ def test_astype_string_and_object(dtype, new_dtype): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index a6b549d24c66d..f47815ee059af 100644 --- a/pandas/tests/dtypes/test_common.py +++ 
b/pandas/tests/dtypes/test_common.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -130,7 +128,6 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x)) def test_pyarrow_string_import_error(name, dtype): # GH-44276 diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 24d0a7626723e..ce2ed5e9764bd 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -343,6 +345,7 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 91cf1708ed43b..3efcd82da42e4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2118,7 +2118,7 @@ def test_series_string_inference_storage_definition(self): # returning the NA string dtype, so expected is changed from # "string[pyarrow_numpy]" to "string[pyarrow]" pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow]") + expected = Series(["a", "b"], dtype="string[python]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index e94f656fc9823..6c4bec6a23789 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,7 +2,15 @@ import pandas as pd -object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") + +def is_object_or_nan_string_dtype(dtype): + """ + Check if string-like dtype is following NaN semantics, i.e. is object + dtype or a NaN-variant of the StringDtype. 
+ """ + return (isinstance(dtype, np.dtype) and dtype == "object") or ( + dtype.na_value is np.nan + ) def _convert_na_value(ser, expected): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index fb308b72e47f5..29adc1db994e9 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -13,7 +13,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) # -------------------------------------------------------------------------------------- @@ -33,7 +33,9 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, @@ -52,7 +54,9 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -79,14 +83,18 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -171,7 +179,9 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series( [False, False, False, True, True, False, np.nan, False, False, True], dtype=expected_dtype, @@ -212,7 +222,9 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -230,7 +242,9 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -675,7 +689,9 @@ def test_replace_regex_single_character(regex, 
any_string_dtype): def test_match(any_string_dtype): # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") @@ -730,12 +746,16 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -743,7 +763,9 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -759,7 +781,9 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -768,7 +792,9 @@ def test_fullmatch_dollar_literal(any_string_dtype): # GH 56652 ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) result = ser.str.fullmatch("foo\\$") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -778,14 +804,18 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype, performance_warning): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], 
dtype=expected_dtype) @@ -859,7 +889,9 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -911,7 +943,9 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 452e5ec5cf939..4fab6e7778002 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -14,7 +14,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) @@ -385,7 +385,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1ea1b030604a3..1ce46497c3c22 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,7 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import is_object_or_nan_string_dtype @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -41,7 +41,9 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -93,7 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -207,7 +209,9 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -233,7 +237,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): 
["A", "3", "¼", "★", "፸", "3", "four"], # noqa: RUF001 dtype=any_string_dtype, ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -253,7 +259,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -284,7 +292,9 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + "float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -313,7 +323,9 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -354,7 +366,9 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) From 9c08431e0676fd959a02ea93423fb715ba645000 Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Wed, 31 Jul 2024 13:25:37 -0400 Subject: [PATCH 257/272] ENH: Allow to plot weighted KDEs. (#59337) * added weights argument in _plot function and modified scipy.stats.gaussian_kde accordingly * Update pandas/plotting/_core.py Co-authored-by: Xiao Yuan * moved "weights" after "ind" * added entry in whatsnew v3.0.0.rst * added new test for weights * removed weights * moved message from Plotting section to Other enhancements --------- Co-authored-by: Xiao Yuan --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/plotting/_core.py | 6 +++++- pandas/plotting/_matplotlib/hist.py | 3 ++- pandas/tests/plotting/test_series.py | 16 ++++++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 768b12ba1007f..1627a90fc6ac0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -35,6 +35,7 @@ Other enhancements - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). 
- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) +- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) - Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 17df98f026656..b60392368d944 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1450,6 +1450,7 @@ def kde( self, bw_method: Literal["scott", "silverman"] | float | Callable | None = None, ind: np.ndarray | int | None = None, + weights: np.ndarray | None = None, **kwargs, ) -> PlotAccessor: """ @@ -1475,6 +1476,9 @@ def kde( 1000 equally spaced points are used. If `ind` is a NumPy array, the KDE is evaluated at the points passed. If `ind` is an integer, `ind` number of equally spaced points are used. + weights : NumPy array, optional + Weights of datapoints. This must be the same shape as datapoints. + If None (default), the samples are assumed to be equally weighted. **kwargs Additional keyword arguments are documented in :meth:`DataFrame.plot`. @@ -1560,7 +1564,7 @@ def kde( >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) """ - return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs) + return self(kind="kde", bw_method=bw_method, ind=ind, weights=weights, **kwargs) density = kde diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 2c4d714bf1a0c..97e510982ab93 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -269,6 +269,7 @@ def _plot( # type: ignore[override] y: np.ndarray, style=None, bw_method=None, + weights=None, ind=None, column_num=None, stacking_id: int | None = None, @@ -277,7 +278,7 @@ def _plot( # type: ignore[override] from scipy.stats import gaussian_kde y = remove_na_arraylike(y) - gkde = gaussian_kde(y, bw_method=bw_method) + gkde = gaussian_kde(y, bw_method=bw_method, weights=weights) y = gkde.evaluate(ind) lines = MPLPlot._plot(ax, ind, y, style=style, **kwds) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 2ca9dbf92e617..52ca66c218862 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -538,6 +538,22 @@ def test_kde_kwargs(self, ts, bw_method, ind): pytest.importorskip("scipy") _check_plot_works(ts.plot.kde, bw_method=bw_method, ind=ind) + @pytest.mark.parametrize( + "bw_method, ind, weights", + [ + ["scott", 20, None], + [None, 20, None], + [None, np.int_(20), None], + [0.5, np.linspace(-100, 100, 20), None], + ["scott", 40, np.linspace(0.0, 2.0, 50)], + ], + ) + def test_kde_kwargs_weights(self, bw_method, ind, weights): + # GH59337 + pytest.importorskip("scipy") + s = Series(np.random.default_rng(2).uniform(size=50)) + _check_plot_works(s.plot.kde, bw_method=bw_method, ind=ind, weights=weights) + def test_density_kwargs(self, ts): pytest.importorskip("scipy") sample_points = np.linspace(-100, 100, 20) From 4b4c86ea8f41889ac3e8b209eb360427437ab240 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 20:25:49 
+0200 Subject: [PATCH 258/272] TST (string dtype): remove usage of arrow_string_storage fixture (#59368) * TST (string dtype): remove usage of arrow_string_storage fixture * fixup --- pandas/conftest.py | 8 -------- pandas/tests/arrays/string_/test_string.py | 16 ++++++++-------- pandas/tests/arrays/string_/test_string_arrow.py | 12 ++++++------ pandas/tests/extension/test_string.py | 12 ++++++------ 4 files changed, 20 insertions(+), 28 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 1d8334a7fe32c..e38ca38ffbe8b 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2039,14 +2039,6 @@ def warsaw(request) -> str: return request.param -@pytest.fixture -def arrow_string_storage(): - """ - Fixture that lists possible PyArrow values for StringDtype storage field. - """ - return ("pyarrow", "pyarrow_numpy") - - @pytest.fixture def temp_file(tmp_path): """ diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3fde3cbca8d8c..96a57e849d021 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -166,8 +166,8 @@ def test_add(dtype): tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage: +def test_add_2d(dtype, request): + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.applymarker(mark) @@ -462,8 +462,8 @@ def test_min_max(method, skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage and box is pd.array: +def test_min_max_numpy(method, box, dtype, request): + if dtype.storage == "pyarrow" and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -477,7 +477,7 @@ def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): assert result == expected -def test_fillna_args(dtype, arrow_string_storage): +def test_fillna_args(dtype): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -490,7 +490,7 @@ def test_fillna_args(dtype, arrow_string_storage): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": msg = "Invalid value '1' for dtype string" else: msg = "Cannot set non-string value '1' into a StringArray." 
@@ -616,10 +616,10 @@ def test_value_counts_sort_false(dtype): tm.assert_series_equal(result, expected) -def test_memory_usage(dtype, arrow_string_storage): +def test_memory_usage(dtype): # GH 33963 - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index c610ef5315723..0e0db74a37e58 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -48,18 +48,18 @@ def test_config_bad_storage_raises(): @pytest.mark.parametrize("chunked", [True, False]) -@pytest.mark.parametrize("array", ["numpy", "pyarrow"]) -def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): +@pytest.mark.parametrize("array_lib", ["numpy", "pyarrow"]) +def test_constructor_not_string_type_raises(array_lib, chunked): pa = pytest.importorskip("pyarrow") - array = pa if array in arrow_string_storage else np + array_lib = pa if array_lib == "pyarrow" else np - arr = array.array([1, 2, 3]) + arr = array_lib.array([1, 2, 3]) if chunked: - if array is np: + if array_lib is np: pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - if array is np: + if array_lib is np: msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 64b383ded97b5..10fea981e0a72 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -116,8 +116,8 @@ def test_is_not_string_type(self, dtype): # because StringDtype is a string type assert is_string_dtype(dtype) - def test_view(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_view(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -125,13 +125,13 @@ def test_from_dtype(self, data): # base test uses string representation of dtype pass - def test_transpose(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_transpose(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) - def test_setitem_preserves_views(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_setitem_preserves_views(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) From 0d12b44315e19db7c1aaa76359b050e0b8d964e0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 31 Jul 2024 22:44:18 +0200 Subject: [PATCH 259/272] TST (string dtype): replace string_storage fixture with explicit storage/na_value keyword arguments for dtype creation (#59375) --- pandas/conftest.py | 18 ++++++++++++++++++ pandas/tests/arrays/string_/test_string.py | 7 ++++--- pandas/tests/extension/test_string.py | 5 +++-- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index e38ca38ffbe8b..512f6429be24a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1310,6 +1310,24 @@ def string_storage(request): return request.param +@pytest.fixture( + params=[ + 
("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ] +) +def string_dtype_arguments(request): + """ + Parametrized fixture for StringDtype storage and na_value. + + * 'python' + pd.NA + * 'pyarrow' + pd.NA + * 'pyarrow' + np.nan + """ + return request.param + + @pytest.fixture( params=[ "numpy_nullable", diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 96a57e849d021..d741d6da43a8c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -23,9 +23,10 @@ @pytest.fixture -def dtype(string_storage): - """Fixture giving StringDtype from parametrized 'string_storage'""" - return pd.StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + """Fixture giving StringDtype from parametrized storage and na_value arguments""" + storage, na_value = string_dtype_arguments + return pd.StringDtype(storage=storage, na_value=na_value) @pytest.fixture diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 10fea981e0a72..2ab248787a1cf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -59,8 +59,9 @@ def chunked(request): @pytest.fixture -def dtype(string_storage): - return StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + storage, na_value = string_dtype_arguments + return StringDtype(storage=storage, na_value=na_value) @pytest.fixture From 6adba55e2f1dc0bf7c222ef4220caffbabb4544e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Aug 2024 17:55:37 +0200 Subject: [PATCH 260/272] String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) (#59376) * String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) * add type annotation --- pandas/conftest.py | 2 -- pandas/core/config_init.py | 18 ++++++++++- pandas/tests/arrays/string_/test_string.py | 32 +++++-------------- .../tests/arrays/string_/test_string_arrow.py | 10 +++--- pandas/tests/frame/methods/test_astype.py | 9 ++++++ .../frame/methods/test_convert_dtypes.py | 5 ++- pandas/tests/io/conftest.py | 16 ---------- 7 files changed, 42 insertions(+), 50 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 512f6429be24a..11196ad069366 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1296,7 +1296,6 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1305,7 +1304,6 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'pyarrow_numpy' """ return request.param diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 352020f45388f..e62cda0dfe8d0 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -14,6 +14,7 @@ from collections.abc import Callable import os +from typing import Any import pandas._config.config as cf from pandas._config.config import ( @@ -455,12 +456,27 @@ def is_terminal() -> bool: ``future.infer_string`` is set to True. """ + +def is_valid_string_storage(value: Any) -> None: + legal_values = ["python", "pyarrow"] + if value not in legal_values: + msg = "Value must be one of python|pyarrow" + if value == "pyarrow_numpy": + # TODO: we can remove extra message after 3.0 + msg += ( + ". 
'pyarrow_numpy' was specified, but this option should be " + "enabled using pandas.options.future.infer_string instead" + ) + raise ValueError(msg) + + with cf.config_prefix("mode"): cf.register_option( "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + # validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_valid_string_storage, ) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index d741d6da43a8c..293f3c74223fd 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -514,19 +514,12 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): +def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -534,30 +527,21 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks( - dtype, string_storage2, request, using_infer_string -): +def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -567,10 +551,10 @@ def test_arrow_load_from_zero_chunks( assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py 
b/pandas/tests/arrays/string_/test_string_arrow.py index 0e0db74a37e58..65c6ce8e9cd08 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -27,16 +27,18 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": - request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) - if string_storage == "pyarrow_numpy": + if using_infer_string and string_storage == "python": + # python string storage with na_value=NaN is not yet implemented request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype(string_storage) + dtype = StringDtype( + string_storage, na_value=np.nan if using_infer_string else pd.NA + ) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 0b525c8d9e1de..c6c702a1a0b1b 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -897,3 +897,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val): with option_context("mode.string_storage", string_storage): df.astype("string") tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val): + # GH#51073 - variant of the above test with explicit dtype instances + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + df.astype(any_string_dtype) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 91fa81b5bee2e..59779234b46d9 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -10,6 +10,8 @@ class TestConvertDtypes: + # TODO convert_dtypes should not use NaN variant of string dtype, but always NA + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) @@ -18,9 +20,6 @@ def test_convert_dtypes( ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here - if using_infer_string: - string_storage = "pyarrow_numpy" - df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index ab6cacc4cc860..bdefadf3dbec0 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -224,19 +224,3 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] - - -@pytest.fixture( - params=[ - "python", - pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - ] -) -def string_storage(request): - """ - Parametrized fixture for pd.options.mode.string_storage. 
- - * 'python' - * 'pyarrow' - """ - return request.param From 7ee10913d7192a285383ceec9539191b784c1ed4 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 1 Aug 2024 17:03:32 +0100 Subject: [PATCH 261/272] fix: show correct error message when invalid period alias is passed to to_timestamp (#59373) --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/_libs/tslibs/offsets.pyx | 24 ++++++++----------- .../period/methods/test_to_timestamp.py | 7 ++++++ 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1627a90fc6ac0..e3c4e69db7cbd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -589,7 +589,7 @@ I/O Period ^^^^^^ -- +- Fixed error message when passing invalid period alias to :meth:`PeriodIndex.to_timestamp` (:issue:`58974`) - Plotting diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index ff24c2942cb76..fd1bb3fe3e173 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4752,20 +4752,16 @@ def _validate_to_offset_alias(alias: str, is_period: bool) -> None: alias.lower() not in {"s", "ms", "us", "ns"} and alias.upper().split("-")[0].endswith(("S", "E"))): raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) - if (is_period and - alias.upper() in c_OFFSET_TO_PERIOD_FREQSTR and - alias != "ms" and - alias.upper().split("-")[0].endswith(("S", "E"))): - if (alias.upper().startswith("B") or - alias.upper().startswith("S") or - alias.upper().startswith("C")): - raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) - else: - alias_msg = "".join(alias.upper().split("E", 1)) - raise ValueError( - f"for Period, please use \'{alias_msg}\' " - f"instead of \'{alias}\'" - ) + if ( + is_period and + alias in c_OFFSET_TO_PERIOD_FREQSTR and + alias != c_OFFSET_TO_PERIOD_FREQSTR[alias] + ): + alias_msg = c_OFFSET_TO_PERIOD_FREQSTR.get(alias) + raise ValueError( + f"for Period, please use \'{alias_msg}\' " + f"instead of \'{alias}\'" + ) # TODO: better name? 
diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 3867f9e3245dc..4fe429ce71ee4 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -140,3 +140,10 @@ def test_to_timestamp_1703(self): result = index.to_timestamp() assert result[0] == Timestamp("1/1/2012") + + +def test_ms_to_timestamp_error_message(): + # https://github.com/pandas-dev/pandas/issues/58974#issuecomment-2164265446 + ix = period_range("2000", periods=3, freq="M") + with pytest.raises(ValueError, match="for Period, please use 'M' instead of 'MS'"): + ix.to_timestamp("MS") From d5305258bf789a42dc17abb753b842d94f57b208 Mon Sep 17 00:00:00 2001 From: Kushagr Arora Date: Fri, 2 Aug 2024 12:47:27 -0400 Subject: [PATCH 262/272] DOC: Typo in docs for na_values parameter in pandas.read_csv function #59314 (#59318) * fixed formatting of default na values * bug fix * fixing docstring formatting * fixing formatting for quoting --- pandas/io/parsers/readers.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c28d3aaaf4748..0cca1ebdb8c8f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -142,8 +142,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): _read_shared = dict -_doc_read_csv_and_table = ( - r""" +_doc_read_csv_and_table = r""" {summary} Also supports optionally iterating or breaking of the file @@ -272,10 +271,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: " """ - + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") - + """ ". - + ``NaN``: "{na_values_str}". keep_default_na : bool, default True Whether or not to include the default ``NaN`` values when parsing the data. Depending on whether ``na_values`` is passed in, the behavior is as follows: @@ -357,8 +353,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): quotechar : str (length 1), optional Character used to denote the start and end of a quoted item. Quoted items can include the ``delimiter`` and it will be ignored. -quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \ -3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL +quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL Control field quoting behavior per ``csv.QUOTE_*`` constants. 
Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, @@ -544,8 +539,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): col 2 datetime64[ns] col 3 datetime64[ns] dtype: object -""" -) +""" # noqa: E501 class _C_Parser_Defaults(TypedDict): @@ -756,6 +750,9 @@ def read_csv( summary="Read a comma-separated values (csv) file into DataFrame.", see_also_func_name="read_table", see_also_func_summary="Read general delimited file into DataFrame.", + na_values_str=fill( + '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " + ), _default_sep="','", storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] @@ -888,6 +885,9 @@ def read_table( see_also_func_summary=( "Read a comma-separated values (csv) file into DataFrame." ), + na_values_str=fill( + '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " + ), _default_sep=r"'\\t' (tab-stop)", storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] From 642d2446060afb11f9860c79a7339eb6ec96fea7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Aug 2024 06:48:35 -1000 Subject: [PATCH 263/272] CI: Use Miniforge over Mambaforge (#59357) * CI: Use Miniforge over Mambaforge * unbound variable * Unbound variable * Remove unset flag * Missing 3 --- .circleci/config.yml | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 745b04a5159f7..7533899f90470 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,15 +15,15 @@ jobs: - checkout - run: name: Install Environment and Run Tests - shell: /bin/bash -exuo pipefail + shell: /bin/bash -exo pipefail command: | - MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-aarch64.sh" - wget -q $MAMBA_URL -O minimamba.sh - chmod +x minimamba.sh - MAMBA_DIR="$HOME/miniconda3" - rm -rf $MAMBA_DIR - ./minimamba.sh -b -p $MAMBA_DIR - export PATH=$MAMBA_DIR/bin:$PATH + MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" + wget -q $MINI_URL -O Miniforge3.sh + chmod +x Miniforge3.sh + MINI_DIR="$HOME/miniconda3" + rm -rf $MINI_DIR + ./Miniforge3.sh -b -p $MINI_DIR + export PATH=$MINI_DIR/bin:$PATH conda info -a conda env create -q -n pandas-dev -f $ENV_FILE conda list -n pandas-dev @@ -97,21 +97,16 @@ jobs: - run: name: Install Anaconda Client & Upload Wheels + shell: /bin/bash -exo pipefail command: | - echo "Install Mambaforge" - MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-aarch64.sh" - echo "Downloading $MAMBA_URL" - wget -q $MAMBA_URL -O minimamba.sh - chmod +x minimamba.sh - - MAMBA_DIR="$HOME/miniconda3" - rm -rf $MAMBA_DIR - ./minimamba.sh -b -p $MAMBA_DIR - - export PATH=$MAMBA_DIR/bin:$PATH - - mamba install -y -c conda-forge anaconda-client - + MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" + wget -q $MINI_URL -O Miniforge3.sh + chmod +x Miniforge3.sh + MINI_DIR="$HOME/miniconda3" + rm -rf $MINI_DIR + ./Miniforge3.sh -b -p $MINI_DIR + export PATH=$MINI_DIR/bin:$PATH + conda install -y -c conda-forge anaconda-client source ci/upload_wheels.sh 
set_upload_vars upload_wheels From b6317f29198996130a71a99e8e2e5c0b5d04bef8 Mon Sep 17 00:00:00 2001 From: Shmulik Cohen <34924662+anuk909@users.noreply.github.com> Date: Mon, 5 Aug 2024 20:40:46 +0300 Subject: [PATCH 264/272] =?UTF-8?q?DOC:=20Remove=20outdated=20performance?= =?UTF-8?q?=20metrics=20from=20'Accelerated=20operations=E2=80=A6=20(#5941?= =?UTF-8?q?6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DOC: Remove outdated performance metrics from 'Accelerated operations' section Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- doc/source/user_guide/basics.rst | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 5cdc9779ef4e1..ffd7a2ad7bb01 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -155,16 +155,6 @@ speedups. ``numexpr`` uses smart chunking, caching, and multiple cores. ``bottle a set of specialized cython routines that are especially fast when dealing with arrays that have ``nans``. -Here is a sample (using 100 column x 100,000 row ``DataFrames``): - -.. csv-table:: - :header: "Operation", "0.11.0 (ms)", "Prior Version (ms)", "Ratio to Prior" - :widths: 25, 25, 25, 25 - - ``df1 > df2``, 13.32, 125.35, 0.1063 - ``df1 * df2``, 21.71, 36.63, 0.5928 - ``df1 + df2``, 22.04, 36.50, 0.6039 - You are highly encouraged to install both libraries. See the section :ref:`Recommended Dependencies ` for more installation info. From b3215b55ee99114f1c204c9b7fb3b132624fdff5 Mon Sep 17 00:00:00 2001 From: Katsia <47710336+KatsiarynaDzibrova@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:15:04 +0100 Subject: [PATCH 265/272] DOC: Fix RT03 for pandas.Series.str.find (#59394) * fix RT03 for pandas.Series.str.find * remove check from code_checks.sh * Remove pandas.Series.str.rfind RT03 from ci --- ci/code_checks.sh | 2 -- pandas/core/strings/accessor.py | 7 +++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ebcc99100be70..ba7777ed44624 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -165,7 +165,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.center RT03,SA01" \ -i "pandas.Series.str.decode PR07,RT03,SA01" \ -i "pandas.Series.str.encode PR07,RT03,SA01" \ - -i "pandas.Series.str.find RT03" \ -i "pandas.Series.str.fullmatch RT03" \ -i "pandas.Series.str.get RT03,SA01" \ -i "pandas.Series.str.index RT03" \ @@ -177,7 +176,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.partition RT03" \ -i "pandas.Series.str.repeat SA01" \ -i "pandas.Series.str.replace SA01" \ - -i "pandas.Series.str.rfind RT03" \ -i "pandas.Series.str.rindex RT03" \ -i "pandas.Series.str.rjust RT03,SA01" \ -i "pandas.Series.str.rpartition RT03" \ diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index dd9276179cf4d..15269f439cc42 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2971,6 +2971,9 @@ def extractall(self, pat, flags: int = 0) -> DataFrame: Returns ------- Series or Index of int. + A Series (if the input is a Series) or an Index (if the input is an + Index) of the %(side)s indexes corresponding to the positions where the + substring is found in each string of the input. 
See Also -------- @@ -2980,9 +2983,9 @@ def extractall(self, pat, flags: int = 0) -> DataFrame: -------- For Series.str.find: - >>> ser = pd.Series(["cow_", "duck_", "do_ve"]) + >>> ser = pd.Series(["_cow_", "duck_", "do_v_e"]) >>> ser.str.find("_") - 0 3 + 0 0 1 4 2 2 dtype: int64 From 72e3a321a7095e565a7e4714d7ca5344a599b67b Mon Sep 17 00:00:00 2001 From: Katsia <47710336+KatsiarynaDzibrova@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:15:59 +0100 Subject: [PATCH 266/272] DOC: Fix pandas.Series.str.get RT03,SA01 (#59396) * DOC: Fix pandas.Series.str.get RT03,SA01 * DOC: remove pandas.Series.str.get from code_checks.sh --- ci/code_checks.sh | 1 - pandas/core/strings/accessor.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ba7777ed44624..420e86ecddaa1 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -166,7 +166,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.decode PR07,RT03,SA01" \ -i "pandas.Series.str.encode PR07,RT03,SA01" \ -i "pandas.Series.str.fullmatch RT03" \ - -i "pandas.Series.str.get RT03,SA01" \ -i "pandas.Series.str.index RT03" \ -i "pandas.Series.str.ljust RT03,SA01" \ -i "pandas.Series.str.lower RT03" \ diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 15269f439cc42..b37e22c9a91ec 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1085,6 +1085,13 @@ def get(self, i): Returns ------- Series or Index + Series or Index where each value is the extracted element from + the corresponding input component. + + See Also + -------- + Series.str.extract : Extract capture groups in the regex as columns + in a DataFrame. Examples -------- From 81a12ddd270fc3bbed4e6c49098d3a5f37dfef77 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 5 Aug 2024 10:11:18 -1000 Subject: [PATCH 267/272] CI: Install libegl explicitly for pytest-qt on ubuntu (#59423) * CI: Install libegl explicitly for pytest-qt on ubuntu * Add libopengl * Wrong name --- .circleci/config.yml | 2 ++ .github/workflows/code-checks.yml | 5 +++++ .github/workflows/docbuild-and-upload.yml | 4 ++++ .github/workflows/unit-tests.yml | 4 ++-- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7533899f90470..ebbca94718259 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,6 +16,7 @@ jobs: - run: name: Install Environment and Run Tests shell: /bin/bash -exo pipefail + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd command: | MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" wget -q $MINI_URL -O Miniforge3.sh @@ -33,6 +34,7 @@ jobs: fi python -m pip install --no-build-isolation -ve . 
--config-settings=setup-args="--werror" PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH + sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ci/run_tests.sh test-linux-musl: docker: diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 937af7e49c6d3..7e9c056e75131 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -51,6 +51,11 @@ jobs: # TODO: The doctests have to be run first right now, since the Cython doctests only work # with pandas installed in non-editable mode # This can be removed once pytest-cython doesn't require C extensions to be installed inplace + + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + - name: Run doctests run: cd ci && ./code_checks.sh doctests if: ${{ steps.build.outcome == 'success' && always() }} diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 924a6263f34d2..47b97fa57852a 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -46,6 +46,10 @@ jobs: - name: Build Pandas uses: ./.github/actions/build_pandas + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + - name: Test website run: python -m pytest web/ diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 4539884e6afd3..a085d0265a1a5 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -134,8 +134,8 @@ jobs: fetch-depth: 0 - name: Extra installs - run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }} - if: ${{ matrix.extra_apt }} + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ${{ matrix.extra_apt || ''}} - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests From 66c5b9a8c9ce2ed48b0ddd57a9f539a67c5dd7ce Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 6 Aug 2024 02:07:12 +0530 Subject: [PATCH 268/272] DOC: fix SA01 for pandas.MultiIndex.get_level_values (#59400) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/code_checks.sh | 1 - pandas/core/indexes/multi.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 420e86ecddaa1..6d7ba8c941502 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc_level PR07" \ -i "pandas.MultiIndex.names SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 799b6f32c1387..58664b07f4a46 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1783,6 
+1783,16 @@ def get_level_values(self, level) -> Index: # type: ignore[override] Return vector of label values for requested level. Length of returned vector is equal to the length of the index. + The `get_level_values` method is a crucial utility for extracting + specific level values from a `MultiIndex`. This function is particularly + useful when working with multi-level data, allowing you to isolate + and manipulate individual levels without having to deal with the + complexity of the entire `MultiIndex` structure. It seamlessly handles + both integer and string-based level access, providing flexibility in + how you can interact with the data. Additionally, this method ensures + that the returned `Index` maintains the integrity of the original data, + even when missing values are present, by appropriately casting the + result to a suitable data type. Parameters ---------- @@ -1796,6 +1806,13 @@ def get_level_values(self, level) -> Index: # type: ignore[override] Values is a level of this MultiIndex converted to a single :class:`Index` (or subclass thereof). + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Index : Immutable sequence used for indexing and alignment. + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + Notes ----- If the level contains missing values, the result may be casted to From 976c1adcc6bcc059a58055a759e3562a03e31827 Mon Sep 17 00:00:00 2001 From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com> Date: Mon, 5 Aug 2024 21:38:25 +0100 Subject: [PATCH 269/272] DOC: Add 'See Also' for pandas.api.types.is_sparse (#59406) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6d7ba8c941502..9fba7671c2b17 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -292,7 +292,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_period_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ - -i "pandas.api.types.is_sparse SA01" \ -i "pandas.api.types.is_timedelta64_ns_dtype SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index cd1d5366d6a08..96f22c90fd591 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -187,6 +187,10 @@ def is_sparse(arr) -> bool: bool Whether or not the array-like is a pandas sparse array. + See Also + -------- + api.types.SparseDtype : The dtype object for pandas sparse arrays. + Examples -------- Returns `True` if the parameter is a 1-D pandas sparse array. 
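
A quick doctest-style sketch of the behaviour the new ``See Also`` entry above
cross-references, relating ``is_sparse`` to the ``SparseDtype`` it now points
at. This is illustrative only, not part of the patch; it assumes just a
pandas install, and note that ``is_sparse`` emits a ``DeprecationWarning`` on
recent pandas versions:

    >>> import pandas as pd
    >>> from pandas.api.types import is_sparse
    >>> arr = pd.arrays.SparseArray([0, 0, 1, 0])
    >>> isinstance(arr.dtype, pd.SparseDtype)  # the dtype the docstring links to
    True
    >>> is_sparse(arr)  # same check via the convenience helper
    True
    >>> is_sparse(pd.Series([0, 0, 1, 0]))  # dense data is not sparse
    False
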
From 3b447d62f33cc39eb1e7da15cbce07b4ad7f9190 Mon Sep 17 00:00:00 2001
From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com>
Date: Mon, 5 Aug 2024 21:39:33 +0100
Subject: [PATCH 270/272] DOC: Add 'See Also' for
 pandas.api.types.is_any_real_numeric_dtype (#59404)

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 ci/code_checks.sh            | 1 -
 pandas/core/dtypes/common.py | 6 ++++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 9fba7671c2b17..b447ad8a67005 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -265,7 +265,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
 -i "pandas.api.extensions.ExtensionArray.view SA01" \
 -i "pandas.api.indexers.VariableOffsetWindowIndexer PR01,SA01" \
 -i "pandas.api.interchange.from_dataframe RT03,SA01" \
- -i "pandas.api.types.is_any_real_numeric_dtype SA01" \
 -i "pandas.api.types.is_bool PR01,SA01" \
 -i "pandas.api.types.is_bool_dtype SA01" \
 -i "pandas.api.types.is_categorical_dtype SA01" \
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 96f22c90fd591..3c11b9d723c14 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1185,6 +1185,12 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool:
 boolean
 Whether or not the array or dtype is of a real number dtype.
+ See Also
+ --------
+ is_numeric_dtype : Check if a dtype is numeric.
+ is_complex_dtype : Check if a dtype is complex.
+ is_bool_dtype : Check if a dtype is boolean.
+
 Examples
 --------
 >>> from pandas.api.types import is_any_real_numeric_dtype

From 772fbc0ca5cb7835a3cab647981adadf086b3afa Mon Sep 17 00:00:00 2001
From: ivonastojanovic <80911834+ivonastojanovic@users.noreply.github.com>
Date: Mon, 5 Aug 2024 22:03:35 +0100
Subject: [PATCH 271/272] DOC: Add 'See Also' and 'Parameters' for
 pandas.api.indexers.VariableOffsetWindowIndexer (#59377)

Add 'See Also' and 'Parameters'

Add 'See Also' and 'Parameters' for VariableOffsetWindowIndexer

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 ci/code_checks.sh               |  1 -
 pandas/core/indexers/objects.py | 25 +++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index b447ad8a67005..c69b47ae1d4e8 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -263,7 +263,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
 -i "pandas.api.extensions.ExtensionArray.tolist RT03,SA01" \
 -i "pandas.api.extensions.ExtensionArray.unique RT03,SA01" \
 -i "pandas.api.extensions.ExtensionArray.view SA01" \
- -i "pandas.api.indexers.VariableOffsetWindowIndexer PR01,SA01" \
 -i "pandas.api.interchange.from_dataframe RT03,SA01" \
 -i "pandas.api.types.is_bool PR01,SA01" \
 -i "pandas.api.types.is_bool_dtype SA01" \
diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py
index 083e86500a210..0064aa91056e8 100644
--- a/pandas/core/indexers/objects.py
+++ b/pandas/core/indexers/objects.py
@@ -167,6 +167,31 @@ class VariableOffsetWindowIndexer(BaseIndexer):
 """
 Calculate window boundaries based on a non-fixed offset such as a BusinessDay.
+ Parameters
+ ----------
+ index_array : np.ndarray, default 0
+ Array-like structure specifying the indices for data points.
+ This parameter is currently not used.
+
+ window_size : int, optional, default 0
+ Specifies the number of data points in each window.
+ This parameter is currently not used.
+ + index : DatetimeIndex, optional + ``DatetimeIndex`` of the labels of each observation. + + offset : BaseOffset, optional + ``DateOffset`` representing the size of the window. + + **kwargs + Additional keyword arguments passed to the parent class ``BaseIndexer``. + + See Also + -------- + api.indexers.BaseIndexer : Base class for all indexers. + DataFrame.rolling : Rolling window calculations on DataFrames. + offsets : Module providing various time offset classes. + Examples -------- >>> from pandas.api.indexers import VariableOffsetWindowIndexer From 7a5bfc47b464910540b6af9dc3f591e3f6c19ea0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 15:10:33 -0700 Subject: [PATCH 272/272] Bump pypa/cibuildwheel from 2.19.2 to 2.20.0 (#59413) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.19.2 to 2.20.0. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.19.2...v2.20.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 9f07648b254dd..58adb4efc0627 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -158,7 +158,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.19.2 + uses: pypa/cibuildwheel@v2.20.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env:
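As a closing usage note for the ``VariableOffsetWindowIndexer`` parameters documented in patch 271 above, here is a minimal sketch; the frame, dates, and one-business-day offset are hypothetical choices for illustration only:

>>> import pandas as pd
>>> from pandas.api.indexers import VariableOffsetWindowIndexer
>>> df = pd.DataFrame({"value": range(8)}, index=pd.date_range("2023-01-02", periods=8, freq="D"))
>>> indexer = VariableOffsetWindowIndexer(index=df.index, offset=pd.offsets.BDay(1))
>>> result = df.rolling(indexer).sum()  # variable windows: a window ending on a Monday also spans the preceding weekend

Only ``index`` and ``offset`` are needed here; ``index_array`` and ``window_size`` are accepted for compatibility but, as the new docstring notes, currently unused.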