From 20e87cb343a18d6af4085ab6b368d433ca07be8d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Sep 2023 08:23:09 -0700 Subject: [PATCH 01/17] Bump actions/checkout from 3 to 4 (#55086) Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/code-checks.yml | 8 ++++---- .github/workflows/codeql.yml | 2 +- .github/workflows/comment-commands.yml | 2 +- .github/workflows/docbuild-and-upload.yml | 2 +- .github/workflows/package-checks.yml | 4 ++-- .github/workflows/unit-tests.yml | 6 +++--- .github/workflows/wheels.yml | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index f87aef5385898..3bd68c07dcbc3 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -109,7 +109,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -143,7 +143,7 @@ jobs: run: docker image prune -f - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -164,7 +164,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 8715c5306a3b0..2182e89731990 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,7 +27,7 @@ jobs: - python steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml index 2550d4de34a45..55dd733d25b50 100644 --- a/.github/workflows/comment-commands.yml +++ b/.github/workflows/comment-commands.yml @@ -51,7 +51,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index e05f12ac6416a..deaf2be0a0423 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -36,7 +36,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 04abcf4ce8816..64a94d7fde5a9 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -62,7 +62,7 @@ jobs: cancel-in-progress: true steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 6410f2edd6175..f2b426269098b 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -136,7 +136,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -194,7 +194,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -330,7 +330,7 @@ jobs: PYTEST_TARGET: pandas steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 97d78a1a9afe3..83d14b51092e6 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -48,7 +48,7 @@ jobs: sdist_file: ${{ steps.save-path.outputs.sdist_name }} steps: - name: Checkout pandas - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -103,7 +103,7 @@ jobs: IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} steps: - name: Checkout pandas - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 From a0d4725dd14176602761a3b8edd7d6c0ce41aa08 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 11 Sep 2023 11:29:43 -0400 Subject: [PATCH 02/17] BUG: concat(axis=1) ignoring sort parameter for DatetimeIndex (#55085) BUG: concat ignoring sort parameter for DatetimeIndex --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/indexes/api.py | 1 - pandas/tests/reshape/concat/test_datetimes.py | 12 ++++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index a795514aa31f8..609f99e26cf3b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -247,8 +247,8 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ +- Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) -- Sparse ^^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index a8ef0e034ba9b..6a36021d9e7c5 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -288,7 +288,6 @@ def _find_common_index_dtype(inds): raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if len(dtis) == len(indexes): - sort = True result = indexes[0] elif len(dtis) > 1: diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 2f50a19189987..12d28c388d508 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -77,23 +77,23 @@ def test_concat_datetime_timezone(self): exp_idx = DatetimeIndex( [ - "2010-12-31 15:00:00+00:00", - "2010-12-31 16:00:00+00:00", - "2010-12-31 17:00:00+00:00", "2010-12-31 23:00:00+00:00", "2011-01-01 00:00:00+00:00", "2011-01-01 01:00:00+00:00", + "2010-12-31 15:00:00+00:00", + "2010-12-31 16:00:00+00:00", + "2010-12-31 17:00:00+00:00", ] ) expected = DataFrame( [ - [np.nan, 1], - [np.nan, 2], - [np.nan, 3], [1, np.nan], [2, np.nan], [3, np.nan], + [np.nan, 1], + [np.nan, 2], + [np.nan, 3], ], index=exp_idx, columns=["a", "b"], From 8929630664bd6d4898de7c6309759ce21b9818a1 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 11 Sep 2023 11:55:21 -0400 Subject: [PATCH 03/17] PERF: concat(axis=1) with unaligned indexes (#55084) * PERF: concat(axis=1) with unaligned indexes * whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 ++ pandas/core/indexes/api.py | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 609f99e26cf3b..e5ce0893c947b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -158,9 +158,11 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) +- .. --------------------------------------------------------------------------- .. _whatsnew_220.bug_fixes: diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 6a36021d9e7c5..877b8edb32520 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -239,8 +239,12 @@ def _unique_indices(inds, dtype) -> Index: Index """ if all(isinstance(ind, Index) for ind in inds): - result = inds[0].append(inds[1:]).unique() - result = result.astype(dtype, copy=False) + inds = [ind.astype(dtype, copy=False) for ind in inds] + result = inds[0].unique() + other = inds[1].append(inds[2:]) + diff = other[result.get_indexer_for(other) == -1] + if len(diff): + result = result.append(diff.unique()) if sort: result = result.sort_values() return result From debf5ace2e63b09901ec11bc9d533a9e9b40545c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 11 Sep 2023 12:13:42 -0400 Subject: [PATCH 04/17] TYP: Misc type corrections (#55078) --- pandas/_libs/tslibs/period.pyi | 2 +- pandas/_libs/tslibs/timedeltas.pyi | 7 +++-- pandas/_libs/tslibs/timestamps.pyi | 29 +++++++++-------- pandas/_typing.py | 2 +- pandas/core/reshape/pivot.py | 3 +- pandas/io/parsers/readers.py | 50 ++++++++++++++++++++++++------ 6 files changed, 64 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 8826757e31c32..c85865fea8fd0 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -89,7 +89,7 @@ class Period(PeriodMixin): @classmethod def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod - def now(cls, freq: BaseOffset = ...) -> Period: ... + def now(cls, freq: Frequency = ...) -> Period: ... def strftime(self, fmt: str) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index aba9b25b23154..6d993722ce1d4 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -14,6 +14,7 @@ from pandas._libs.tslibs import ( Tick, ) from pandas._typing import ( + Frequency, Self, npt, ) @@ -117,9 +118,9 @@ class Timedelta(timedelta): @property def asm8(self) -> np.timedelta64: ... # TODO: round/floor/ceil could return NaT? - def round(self, freq: str) -> Self: ... - def floor(self, freq: str) -> Self: ... - def ceil(self, freq: str) -> Self: ... + def round(self, freq: Frequency) -> Self: ... + def floor(self, freq: Frequency) -> Self: ... + def ceil(self, freq: Frequency) -> Self: ... @property def resolution_string(self) -> str: ... def __add__(self, other: timedelta) -> Timedelta: ... diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 36ae2d6d892f1..e23f01b800874 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -8,6 +8,8 @@ from datetime import ( from time import struct_time from typing import ( ClassVar, + Literal, + TypeAlias, TypeVar, overload, ) @@ -27,6 +29,7 @@ from pandas._typing import ( ) _DatetimeT = TypeVar("_DatetimeT", bound=datetime) +_TimeZones: TypeAlias = str | _tzinfo | None | int def integer_op_not_supported(obj: object) -> TypeError: ... @@ -51,13 +54,13 @@ class Timestamp(datetime): tzinfo: _tzinfo | None = ..., *, nanosecond: int | None = ..., - tz: str | _tzinfo | None | int = ..., + tz: _TimeZones = ..., unit: str | int | None = ..., fold: int | None = ..., ) -> _DatetimeT | NaTType: ... @classmethod def _from_value_and_reso( - cls, value: int, reso: int, tz: _tzinfo | None + cls, value: int, reso: int, tz: _TimeZones ) -> Timestamp: ... @property def value(self) -> int: ... # np.int64 @@ -84,19 +87,19 @@ class Timestamp(datetime): @property def fold(self) -> int: ... @classmethod - def fromtimestamp(cls, ts: float, tz: _tzinfo | None = ...) -> Self: ... + def fromtimestamp(cls, ts: float, tz: _TimeZones = ...) -> Self: ... @classmethod def utcfromtimestamp(cls, ts: float) -> Self: ... @classmethod - def today(cls, tz: _tzinfo | str | None = ...) -> Self: ... + def today(cls, tz: _TimeZones = ...) -> Self: ... @classmethod def fromordinal( cls, ordinal: int, - tz: _tzinfo | str | None = ..., + tz: _TimeZones = ..., ) -> Self: ... @classmethod - def now(cls, tz: _tzinfo | str | None = ...) -> Self: ... + def now(cls, tz: _TimeZones = ...) -> Self: ... @classmethod def utcnow(cls) -> Self: ... # error: Signature of "combine" incompatible with supertype "datetime" @@ -131,7 +134,7 @@ class Timestamp(datetime): fold: int | None = ..., ) -> Self: ... # LSP violation: datetime.datetime.astimezone has a default value for tz - def astimezone(self, tz: _tzinfo | None) -> Self: ... # type: ignore[override] + def astimezone(self, tz: _TimeZones) -> Self: ... # type: ignore[override] def ctime(self) -> str: ... def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... @classmethod @@ -184,12 +187,12 @@ class Timestamp(datetime): def to_julian_date(self) -> np.float64: ... @property def asm8(self) -> np.datetime64: ... - def tz_convert(self, tz: _tzinfo | str | None) -> Self: ... + def tz_convert(self, tz: _TimeZones) -> Self: ... # TODO: could return NaT? def tz_localize( self, - tz: _tzinfo | str | None, - ambiguous: str = ..., + tz: _TimeZones, + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def normalize(self) -> Self: ... @@ -197,19 +200,19 @@ class Timestamp(datetime): def round( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def floor( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def ceil( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def day_name(self, locale: str | None = ...) -> str: ... diff --git a/pandas/_typing.py b/pandas/_typing.py index 743815b91210d..c2bbebfbe2857 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -112,7 +112,7 @@ # Cannot use `Sequence` because a string is a sequence, and we don't want to # accept that. Could refine if https://github.com/python/typing/issues/256 is # resolved to differentiate between Sequence[str] and str -ListLike = Union[AnyArrayLike, list, range] +ListLike = Union[AnyArrayLike, list, tuple, range] # scalars diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e8ca520e7b420..79354fdd12a2d 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -7,6 +7,7 @@ from typing import ( TYPE_CHECKING, Callable, + Literal, cast, ) @@ -569,7 +570,7 @@ def crosstab( margins: bool = False, margins_name: Hashable = "All", dropna: bool = True, - normalize: bool = False, + normalize: bool | Literal[0, 1, "all", "index", "columns"] = False, ) -> DataFrame: """ Compute a simple cross tabulation of two (or more) factors. diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index e0f171035e89e..e826aad478059 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -638,7 +638,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -697,7 +700,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -757,7 +763,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -817,7 +826,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -888,7 +900,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] | Callable[[Hashable], bool] | None = None, + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -983,7 +998,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1040,7 +1058,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1097,7 +1118,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1154,7 +1178,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1224,7 +1251,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] | Callable[[Hashable], bool] | None = None, + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, From aadd9e3a13660a7ac0b11730130447e5b07c01d1 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Mon, 11 Sep 2023 18:14:59 +0200 Subject: [PATCH 05/17] DOC: fix an example which raises an Error in whatsnew/v0.10.0.rst (#55057) * fix an example in whatsnew/v0.10.0.rst * correct thee example in v0.10.0.rst --- doc/source/whatsnew/v0.10.0.rst | 43 +++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 3425986a37743..422efc1b36946 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -180,19 +180,36 @@ labeled the aggregated group with the end of the interval: the next day). DataFrame constructor with no columns specified. The v0.9.0 behavior (names ``X0``, ``X1``, ...) can be reproduced by specifying ``prefix='X'``: -.. ipython:: python - :okexcept: - - import io - - data = """ - a,b,c - 1,Yes,2 - 3,No,4 - """ - print(data) - pd.read_csv(io.StringIO(data), header=None) - pd.read_csv(io.StringIO(data), header=None, prefix="X") +.. code-block:: ipython + + In [6]: import io + + In [7]: data = """ + ...: a,b,c + ...: 1,Yes,2 + ...: 3,No,4 + ...: """ + ...: + + In [8]: print(data) + + a,b,c + 1,Yes,2 + 3,No,4 + + In [9]: pd.read_csv(io.StringIO(data), header=None) + Out[9]: + 0 1 2 + 0 a b c + 1 1 Yes 2 + 2 3 No 4 + + In [10]: pd.read_csv(io.StringIO(data), header=None, prefix="X") + Out[10]: + X0 X1 X2 + 0 a b c + 1 1 Yes 2 + 2 3 No 4 - Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default, though this can be controlled by new ``true_values`` and ``false_values`` From ce5fdf0f55f47014240931a1f975f65767c2442a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 11 Sep 2023 12:34:04 -0400 Subject: [PATCH 06/17] ENH: numba engine in df.apply (#54666) * ENH: numba engine in df.apply * fixes * more fixes * try to fix * address code review * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * go for green * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update type --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/_numba/executor.py | 39 ++++++++++++++++ pandas/core/apply.py | 37 +++++++++++++-- pandas/core/frame.py | 33 ++++++++++++++ pandas/tests/apply/test_frame_apply.py | 62 ++++++++++++++++++++------ 5 files changed, 155 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e5ce0893c947b..07be496a95adc 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -28,7 +28,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 5cd4779907146..0a26acb7df60a 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -15,6 +15,45 @@ from pandas.compat._optional import import_optional_dependency +@functools.cache +def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + nb_compat_func = numba.extending.register_jitable(func) + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def nb_looper(values, axis): + # Operate on the first row/col in order to get + # the output shape + if axis == 0: + first_elem = values[:, 0] + dim0 = values.shape[1] + else: + first_elem = values[0] + dim0 = values.shape[0] + res0 = nb_compat_func(first_elem) + # Use np.asarray to get shape for + # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 + buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape + if axis == 0: + buf_shape = buf_shape[::-1] + buff = np.empty(buf_shape) + + if axis == 1: + buff[0] = res0 + for i in numba.prange(1, values.shape[0]): + buff[i] = nb_compat_func(values[i]) + else: + buff[:, 0] = res0 + for j in numba.prange(1, values.shape[1]): + buff[:, j] = nb_compat_func(values[:, j]) + return buff + + return nb_looper + + @functools.cache def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel): if TYPE_CHECKING: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 26467a4a982fa..78d52ed262c7a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -49,6 +49,7 @@ ABCSeries, ) +from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike @@ -80,6 +81,8 @@ def frame_apply( raw: bool = False, result_type: str | None = None, by_row: Literal[False, "compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args=None, kwargs=None, ) -> FrameApply: @@ -100,6 +103,8 @@ def frame_apply( raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -756,11 +761,15 @@ def __init__( result_type: str | None, *, by_row: Literal[False, "compat"] = False, + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") + self.engine = engine + self.engine_kwargs = engine_kwargs super().__init__( obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs ) @@ -805,6 +814,12 @@ def values(self): def apply(self) -> DataFrame | Series: """compute the results""" + + if self.engine == "numba" and not self.raw: + raise ValueError( + "The numba engine in DataFrame.apply can only be used when raw=True" + ) + # dispatch to handle list-like or dict-like if is_list_like(self.func): return self.apply_list_or_dict_like() @@ -834,7 +849,7 @@ def apply(self) -> DataFrame | Series: # raw elif self.raw: - return self.apply_raw() + return self.apply_raw(engine=self.engine, engine_kwargs=self.engine_kwargs) return self.apply_standard() @@ -907,7 +922,7 @@ def apply_empty_result(self): else: return self.obj.copy() - def apply_raw(self): + def apply_raw(self, engine="python", engine_kwargs=None): """apply to the values as a numpy array""" def wrap_function(func): @@ -925,7 +940,23 @@ def wrapper(*args, **kwargs): return wrapper - result = np.apply_along_axis(wrap_function(self.func), self.axis, self.values) + if engine == "numba": + engine_kwargs = {} if engine_kwargs is None else engine_kwargs + + # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has + # incompatible type "Callable[..., Any] | str | list[Callable + # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | + # list[Callable[..., Any] | str]]"; expected "Hashable" + nb_looper = generate_apply_looper( + self.func, **engine_kwargs # type: ignore[arg-type] + ) + result = nb_looper(self.values, self.axis) + # If we made the result 2-D, squeeze it back to 1-D + result = np.squeeze(result) + else: + result = np.apply_along_axis( + wrap_function(self.func), self.axis, self.values + ) # TODO: mixed type case if result.ndim == 2: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f1fc63bc4b1ea..8fcb91c846826 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9925,6 +9925,8 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): """ @@ -9984,6 +9986,35 @@ def apply( If False, the funcs will be passed the whole Series at once. .. versionadded:: 2.1.0 + + engine : {'python', 'numba'}, default 'python' + Choose between the python (default) engine or the numba engine in apply. + + The numba engine will attempt to JIT compile the passed function, + which may result in speedups for large DataFrames. + It also supports the following engine_kwargs : + + - nopython (compile the function in nopython mode) + - nogil (release the GIL inside the JIT compiled function) + - parallel (try to apply the function in parallel over the DataFrame) + + Note: The numba compiler only supports a subset of + valid Python/numpy operations. + + Please read more about the `supported python features + `_ + and `supported numpy features + `_ + in numba to learn what you can or cannot use in the passed function. + + As of right now, the numba engine can only be used with raw=True. + + .. versionadded:: 2.2.0 + + engine_kwargs : dict + Pass keyword arguments to the engine. + This is currently only used by the numba engine, + see the documentation for the engine argument for more information. **kwargs Additional keyword arguments to pass as keywords arguments to `func`. @@ -10084,6 +10115,8 @@ def apply( raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3a3f73a68374b..3f2accc23e2d6 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -18,6 +18,13 @@ from pandas.tests.frame.common import zip_frames +@pytest.fixture(params=["python", "numba"]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + def test_apply(float_frame): with np.errstate(all="ignore"): # ufunc @@ -234,36 +241,42 @@ def test_apply_broadcast_series_lambda_func(int_frame_const_col): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame(float_frame, axis): +def test_apply_raw_float_frame(float_frame, axis, engine): + if engine == "numba": + pytest.skip("numba can't handle when UDF returns None.") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 - float_frame.apply(_assert_raw, axis=axis, raw=True) + float_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame_lambda(float_frame, axis): - result = float_frame.apply(np.mean, axis=axis, raw=True) +def test_apply_raw_float_frame_lambda(float_frame, axis, engine): + result = float_frame.apply(np.mean, axis=axis, engine=engine, raw=True) expected = float_frame.apply(lambda x: x.values.mean(), axis=axis) tm.assert_series_equal(result, expected) -def test_apply_raw_float_frame_no_reduction(float_frame): +def test_apply_raw_float_frame_no_reduction(float_frame, engine): # no reduction - result = float_frame.apply(lambda x: x * 2, raw=True) + result = float_frame.apply(lambda x: x * 2, engine=engine, raw=True) expected = float_frame * 2 tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_mixed_type_frame(mixed_type_frame, axis): +def test_apply_raw_mixed_type_frame(mixed_type_frame, axis, engine): + if engine == "numba": + pytest.skip("isinstance check doesn't work with numba") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, axis=axis, raw=True) + mixed_type_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) def test_apply_axis1(float_frame): @@ -300,14 +313,20 @@ def test_apply_mixed_dtype_corner_indexing(): ) @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_empty_infer_type(ax, func, raw, axis): +def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): df = DataFrame(**{ax: ["a", "b", "c"]}) with np.errstate(all="ignore"): test_res = func(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) - result = df.apply(func, axis=axis, raw=raw) + if engine == "numba" and raw is False: + mark = pytest.mark.xfail( + reason="numba engine only supports raw=True at the moment" + ) + request.node.add_marker(mark) + + result = df.apply(func, axis=axis, engine=engine, raw=raw) if is_reduction: agg_axis = df._get_agg_axis(axis) assert isinstance(result, Series) @@ -607,8 +626,10 @@ def non_reducing_function(row): assert names == list(df.index) -def test_apply_raw_function_runs_once(): +def test_apply_raw_function_runs_once(engine): # https://github.com/pandas-dev/pandas/issues/34506 + if engine == "numba": + pytest.skip("appending to list outside of numba func is not supported") df = DataFrame({"a": [1, 2, 3]}) values = [] # Save row values function is applied to @@ -623,7 +644,7 @@ def non_reducing_function(row): for func in [reducing_function, non_reducing_function]: del values[:] - df.apply(func, raw=True, axis=1) + df.apply(func, engine=engine, raw=True, axis=1) assert values == list(df.a.to_list()) @@ -1449,10 +1470,12 @@ def test_apply_no_suffix_index(): tm.assert_frame_equal(result, expected) -def test_apply_raw_returns_string(): +def test_apply_raw_returns_string(engine): # https://github.com/pandas-dev/pandas/issues/35940 + if engine == "numba": + pytest.skip("No object dtype support in numba") df = DataFrame({"A": ["aa", "bbb"]}) - result = df.apply(lambda x: x[0], axis=1, raw=True) + result = df.apply(lambda x: x[0], engine=engine, axis=1, raw=True) expected = Series(["aa", "bbb"]) tm.assert_series_equal(result, expected) @@ -1632,3 +1655,14 @@ def test_agg_dist_like_and_nonunique_columns(): result = df.agg({"A": "count"}) expected = df["A"].count() tm.assert_series_equal(result, expected) + + +def test_numba_unsupported(): + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + with pytest.raises( + ValueError, + match="The numba engine in DataFrame.apply can only be used when raw=True", + ): + df.apply(lambda x: x, engine="numba", raw=False) From 417a5e7fcfdc36385c0599f40bd1b0b8e96a3720 Mon Sep 17 00:00:00 2001 From: Rajat Subhra Mukherjee Date: Tue, 12 Sep 2023 00:46:01 +0530 Subject: [PATCH 07/17] Updated future warning msg in transform() for Series.groupby (#55082) * updated warn msg * Update apply.py --- pandas/core/apply.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 78d52ed262c7a..cc594bc8efb34 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1863,12 +1863,12 @@ def warn_alias_replacement( full_alias = alias else: full_alias = f"{type(obj).__name__}.{alias}" - alias = f"'{alias}'" + alias = f'"{alias}"' warnings.warn( f"The provided callable {func} is currently using " f"{full_alias}. In a future version of pandas, " f"the provided callable will be used directly. To keep current " - f"behavior pass {alias} instead.", + f"behavior pass the string {alias} instead.", category=FutureWarning, stacklevel=find_stack_level(), ) From 705d4312cf1d94ef2497bcd8091e0eabd1085f4a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Sep 2023 10:21:08 -1000 Subject: [PATCH 08/17] TST: Make test_hash_equality_invariance xfail more generic (#55094) --- pandas/tests/scalar/timedelta/test_timedelta.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index f1d8acf47b29a..cb797a4168088 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -927,7 +927,6 @@ def test_timedelta_hash_equality(self): @pytest.mark.xfail( reason="pd.Timedelta violates the Python hash invariant (GH#44504).", - raises=AssertionError, ) @given( st.integers( From 79067a76adc448d17210f2cf4a858b0eb853be4c Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Wed, 13 Sep 2023 02:34:56 +0600 Subject: [PATCH 09/17] ENH: add calamine excel reader (close #50395) (#54998) --- ci/deps/actions-310.yaml | 1 + ci/deps/actions-311-downstream_compat.yaml | 1 + ci/deps/actions-311.yaml | 1 + ci/deps/actions-39-minimum_versions.yaml | 1 + ci/deps/actions-39.yaml | 1 + ci/deps/circle-310-arm64.yaml | 1 + doc/source/getting_started/install.rst | 1 + doc/source/user_guide/io.rst | 23 +++- doc/source/whatsnew/v2.2.0.rst | 23 +++- environment.yml | 1 + pandas/compat/_optional.py | 2 + pandas/core/config_init.py | 10 +- pandas/io/excel/_base.py | 16 ++- pandas/io/excel/_calamine.py | 127 +++++++++++++++++ pandas/tests/io/excel/test_readers.py | 130 ++++++++++++------ pyproject.toml | 3 +- requirements-dev.txt | 1 + scripts/tests/data/deps_expected_random.yaml | 1 + scripts/tests/data/deps_minimum.toml | 3 +- .../tests/data/deps_unmodified_random.yaml | 1 + 20 files changed, 290 insertions(+), 58 deletions(-) create mode 100644 pandas/io/excel/_calamine.py diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 2190136220c6c..927003b13d6be 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index cf85345cb0cc2..00df41cce3bae 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 3c1630714a041..d50ea20da1e0c 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 # - pytables>=3.7.0, 3.8.0 is first version that supports 3.11 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index b1cea49e22d15..10862630bd596 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -48,6 +48,7 @@ dependencies: - pymysql=1.0.2 - pyreadstat=1.1.5 - pytables=3.7.0 + - python-calamine=0.1.6 - pyxlsb=1.0.9 - s3fs=2022.05.0 - scipy=1.8.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index b8a119ece4b03..904b55a813a9f 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 71686837451b4..4060cea73e7f6 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 # - pyreadstat>=1.1.5 not available on ARM - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ae7c9d4ea9c62..2c0787397e047 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -281,6 +281,7 @@ xlrd 2.0.1 excel Reading Excel xlsxwriter 3.0.3 excel Writing Excel openpyxl 3.0.10 excel Reading / writing for xlsx files pyxlsb 1.0.9 excel Reading for xlsb files +python-calamine 0.1.6 excel Reading for xls/xlsx/xlsb/ods files ========================= ================== =============== ============================================================= HTML diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ecd547c5ff4d6..6bd181740c78d 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3453,7 +3453,8 @@ Excel files The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files can be read using ``xlrd``. Binary Excel (``.xlsb``) -files can be read using ``pyxlsb``. +files can be read using ``pyxlsb``. All formats can be read +using :ref:`calamine` engine. The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. @@ -3494,6 +3495,9 @@ using internally. * For the engine odf, pandas is using :func:`odf.opendocument.load` to read in (``.ods``) files. +* For the engine calamine, pandas is using :func:`python_calamine.load_workbook` + to read in (``.xlsx``), (``.xlsm``), (``.xls``), (``.xlsb``), (``.ods``) files. + .. code-block:: python # Returns a DataFrame @@ -3935,7 +3939,8 @@ The :func:`~pandas.read_excel` method can also read binary Excel files using the ``pyxlsb`` module. The semantics and features for reading binary Excel files mostly match what can be done for `Excel files`_ using ``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types -in files and will return floats instead. +in files and will return floats instead (you can use :ref:`calamine` +if you need recognize datetime types). .. code-block:: python @@ -3947,6 +3952,20 @@ in files and will return floats instead. Currently pandas only supports *reading* binary Excel files. Writing is not implemented. +.. _io.calamine: + +Calamine (Excel and ODS files) +------------------------------ + +The :func:`~pandas.read_excel` method can read Excel file (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) +and OpenDocument spreadsheets (``.ods``) using the ``python-calamine`` module. +This module is a binding for Rust library `calamine `__ +and is faster than other engines in most cases. The optional dependency 'python-calamine' needs to be installed. + +.. code-block:: python + + # Returns a DataFrame + pd.read_excel("path_to_file.xlsb", engine="calamine") .. _io.clipboard: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 07be496a95adc..249f08c7e387b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -14,10 +14,27 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_220.enhancements.enhancement1: +.. _whatsnew_220.enhancements.calamine: -enhancement1 -^^^^^^^^^^^^ +Calamine engine for :func:`read_excel` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``calamine`` engine was added to :func:`read_excel`. +It uses ``python-calamine``, which provides Python bindings for the Rust library `calamine `__. +This engine supports Excel files (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) and OpenDocument spreadsheets (``.ods``) (:issue:`50395`). + +There are two advantages of this engine: + +1. Calamine is often faster than other engines, some benchmarks show results up to 5x faster than 'openpyxl', 20x - 'odf', 4x - 'pyxlsb', and 1.5x - 'xlrd'. + But, 'openpyxl' and 'pyxlsb' are faster in reading a few rows from large files because of lazy iteration over rows. +2. Calamine supports the recognition of datetime in ``.xlsb`` files, unlike 'pyxlsb' which is the only other engine in pandas that can read ``.xlsb`` files. + +.. code-block:: python + + pd.read_excel("path_to_file.xlsb", engine="calamine") + + +For more, see :ref:`io.calamine` in the user guide on IO tools. .. _whatsnew_220.enhancements.enhancement2: diff --git a/environment.yml b/environment.yml index 1a9dffb55bca7..1eb0b4cc2c7a6 100644 --- a/environment.yml +++ b/environment.yml @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index c5792fa1379fe..fa0e9e974ea39 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -37,6 +37,7 @@ "pyarrow": "7.0.0", "pyreadstat": "1.1.5", "pytest": "7.3.2", + "python-calamine": "0.1.6", "pyxlsb": "1.0.9", "s3fs": "2022.05.0", "scipy": "1.8.1", @@ -62,6 +63,7 @@ "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", + "python_calamine": "python-calamine", "sqlalchemy": "SQLAlchemy", "tables": "pytables", } diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 62455f119a02f..750b374043193 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -513,11 +513,11 @@ def use_inf_as_na_cb(key) -> None: auto, {others}. """ -_xls_options = ["xlrd"] -_xlsm_options = ["xlrd", "openpyxl"] -_xlsx_options = ["xlrd", "openpyxl"] -_ods_options = ["odf"] -_xlsb_options = ["pyxlsb"] +_xls_options = ["xlrd", "calamine"] +_xlsm_options = ["xlrd", "openpyxl", "calamine"] +_xlsx_options = ["xlrd", "openpyxl", "calamine"] +_ods_options = ["odf", "calamine"] +_xlsb_options = ["pyxlsb", "calamine"] with cf.config_prefix("io.excel.xls"): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b4b0f29019c31..073115cab8695 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -159,13 +159,15 @@ of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - "xlrd" supports old-style Excel files (.xls). - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + - "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 The engine `xlrd `_ @@ -394,7 +396,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -433,7 +435,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -472,7 +474,7 @@ def read_excel( | Callable[[str], bool] | None = None, dtype: DtypeArg | None = None, - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None, + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, converters: dict[str, Callable] | dict[int, Callable] | None = None, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, @@ -1456,13 +1458,15 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine`` Engine compatibility : - ``xlrd`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 @@ -1498,6 +1502,7 @@ class ExcelFile: ... df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP """ + from pandas.io.excel._calamine import CalamineReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader @@ -1508,6 +1513,7 @@ class ExcelFile: "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, + "calamine": CalamineReader, } def __init__( diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py new file mode 100644 index 0000000000000..d61a9fc664164 --- /dev/null +++ b/pandas/io/excel/_calamine.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +from datetime import ( + date, + datetime, + time, + timedelta, +) +from typing import ( + TYPE_CHECKING, + Any, + Union, + cast, +) + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc + +import pandas as pd +from pandas.core.shared_docs import _shared_docs + +from pandas.io.excel._base import BaseExcelReader + +if TYPE_CHECKING: + from python_calamine import ( + CalamineSheet, + CalamineWorkbook, + ) + + from pandas._typing import ( + FilePath, + ReadBuffer, + StorageOptions, + ) + +_CellValueT = Union[int, float, str, bool, time, date, datetime, timedelta] + + +class CalamineReader(BaseExcelReader["CalamineWorkbook"]): + @doc(storage_options=_shared_docs["storage_options"]) + def __init__( + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + storage_options: StorageOptions | None = None, + engine_kwargs: dict | None = None, + ) -> None: + """ + Reader using calamine engine (xlsx/xls/xlsb/ods). + + Parameters + ---------- + filepath_or_buffer : str, path to be parsed or + an open readable stream. + {storage_options} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. + """ + import_optional_dependency("python_calamine") + super().__init__( + filepath_or_buffer, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) + + @property + def _workbook_class(self) -> type[CalamineWorkbook]: + from python_calamine import CalamineWorkbook + + return CalamineWorkbook + + def load_workbook( + self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any + ) -> CalamineWorkbook: + from python_calamine import load_workbook + + return load_workbook( + filepath_or_buffer, **engine_kwargs # type: ignore[arg-type] + ) + + @property + def sheet_names(self) -> list[str]: + from python_calamine import SheetTypeEnum + + return [ + sheet.name + for sheet in self.book.sheets_metadata + if sheet.typ == SheetTypeEnum.WorkSheet + ] + + def get_sheet_by_name(self, name: str) -> CalamineSheet: + self.raise_if_bad_sheet_by_name(name) + return self.book.get_sheet_by_name(name) + + def get_sheet_by_index(self, index: int) -> CalamineSheet: + self.raise_if_bad_sheet_by_index(index) + return self.book.get_sheet_by_index(index) + + def get_sheet_data( + self, sheet: CalamineSheet, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: + def _convert_cell(value: _CellValueT) -> Scalar: + if isinstance(value, float): + val = int(value) + if val == value: + return val + else: + return value + elif isinstance(value, date): + return pd.Timestamp(value) + elif isinstance(value, timedelta): + return pd.Timedelta(value) + elif isinstance(value, time): + # cast needed here because Scalar doesn't include datetime.time + return cast(Scalar, value) + + return value + + rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False) + data: list[list[Scalar]] = [] + + for row in rows: + data.append([_convert_cell(cell) for cell in row]) + if file_rows_needed is not None and len(data) >= file_rows_needed: + break + + return data diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 6db70c894f692..de444019e7b4c 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -54,6 +54,7 @@ ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", marks=td.skip_if_no("odf")), + pytest.param("calamine", marks=td.skip_if_no("python_calamine")), ] @@ -67,11 +68,11 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: return False if engine == "odf" and read_ext != ".ods": return False - if read_ext == ".ods" and engine != "odf": + if read_ext == ".ods" and engine not in {"odf", "calamine"}: return False if engine == "pyxlsb" and read_ext != ".xlsb": return False - if read_ext == ".xlsb" and engine != "pyxlsb": + if read_ext == ".xlsb" and engine not in {"pyxlsb", "calamine"}: return False if engine == "xlrd" and read_ext != ".xls": return False @@ -160,9 +161,9 @@ def test_engine_kwargs(self, read_ext, engine): "ods": {"foo": "abcd"}, } - if read_ext[1:] in {"xls", "xlsb"}: + if engine in {"xlrd", "pyxlsb"}: msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'") - elif read_ext[1:] == "ods": + elif engine == "odf": msg = re.escape(r"load() got an unexpected keyword argument 'foo'") else: msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'") @@ -194,8 +195,8 @@ def test_usecols_int(self, read_ext): usecols=3, ) - def test_usecols_list(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_list(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -218,8 +219,8 @@ def test_usecols_list(self, request, read_ext, df_ref): tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_str(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -275,9 +276,9 @@ def test_usecols_str(self, request, read_ext, df_ref): "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] ) def test_usecols_diff_positional_int_columns_order( - self, request, read_ext, usecols, df_ref + self, request, engine, read_ext, usecols, df_ref ): - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -298,8 +299,8 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) - def test_read_excel_without_slicing(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -310,8 +311,8 @@ def test_read_excel_without_slicing(self, request, read_ext, df_ref): result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) tm.assert_frame_equal(result, expected, check_names=False) - def test_usecols_excel_range_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -398,20 +399,26 @@ def test_excel_stop_iterator(self, read_ext): expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) tm.assert_frame_equal(parsed, expected) - def test_excel_cell_error_na(self, request, read_ext): - if read_ext == ".xlsb": + def test_excel_cell_error_na(self, request, engine, read_ext): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + # https://github.com/tafia/calamine/issues/355 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine can't extract error from ods files") + ) + parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1") expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -431,8 +438,8 @@ def test_excel_table(self, request, read_ext, df_ref): ) tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_reader_special_dtypes(self, request, read_ext): - if read_ext == ".xlsb": + def test_reader_special_dtypes(self, request, engine, read_ext): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -571,11 +578,17 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) - def test_dtype_backend(self, read_ext, dtype_backend): + def test_dtype_backend(self, request, engine, read_ext, dtype_backend): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + # GH 54994 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="OdsWriter produces broken file") + ) + df = DataFrame( { "a": Series([1, 3], dtype="Int64"), @@ -616,11 +629,17 @@ def test_dtype_backend(self, read_ext, dtype_backend): expected = df tm.assert_frame_equal(result, expected) - def test_dtype_backend_and_dtype(self, read_ext): + def test_dtype_backend_and_dtype(self, request, engine, read_ext): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + # GH 54994 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="OdsWriter produces broken file") + ) + df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, sheet_name="test", index=False) @@ -632,11 +651,17 @@ def test_dtype_backend_and_dtype(self, read_ext): ) tm.assert_frame_equal(result, df) - def test_dtype_backend_string(self, read_ext, string_storage): + def test_dtype_backend_string(self, request, engine, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") + # GH 54994 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="OdsWriter produces broken file") + ) + pa = pytest.importorskip("pyarrow") with pd.option_context("mode.string_storage", string_storage): @@ -800,8 +825,8 @@ def test_date_conversion_overflow(self, request, engine, read_ext): result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, read_ext, engine, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -869,6 +894,11 @@ def test_corrupt_bytes_raises(self, engine): "Unsupported format, or corrupt file: Expected BOF " "record; found b'foo'" ) + elif engine == "calamine": + from python_calamine import CalamineError + + error = CalamineError + msg = "Cannot detect file format" else: error = BadZipFile msg = "File is not a zip file" @@ -969,6 +999,14 @@ def test_reader_seconds(self, request, engine, read_ext): ) ) + # GH 55045 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="ODS file contains bad datetime (seconds as text)" + ) + ) + # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict( { @@ -994,15 +1032,21 @@ def test_reader_seconds(self, request, engine, read_ext): actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex(self, request, read_ext): + def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + # https://github.com/tafia/calamine/issues/354 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Last test fails in calamine") + ) + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext @@ -1088,10 +1132,10 @@ def test_read_excel_multiindex(self, request, read_ext): ], ) def test_read_excel_multiindex_blank_after_name( - self, request, read_ext, sheet_name, idx_lvl2 + self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb (GH4679" @@ -1212,9 +1256,9 @@ def test_read_excel_bool_header_arg(self, read_ext): with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) - def test_read_excel_skiprows(self, request, read_ext): + def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1267,9 +1311,9 @@ def test_read_excel_skiprows(self, request, read_ext): ) tm.assert_frame_equal(actual, expected) - def test_read_excel_skiprows_callable_not_in(self, request, read_ext): + def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1397,7 +1441,7 @@ def test_trailing_blanks(self, read_ext): def test_ignore_chartsheets_by_str(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1410,7 +1454,7 @@ def test_ignore_chartsheets_by_str(self, request, engine, read_ext): def test_ignore_chartsheets_by_int(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1540,8 +1584,8 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): expected = DataFrame(expected, columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1569,8 +1613,8 @@ def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1639,7 +1683,7 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) + result = pd.read_excel(f, engine=engine) expected = pd.read_excel("test1" + read_ext, engine=engine) tm.assert_frame_equal(result, expected) @@ -1691,7 +1735,7 @@ def test_engine_invalid_option(self, read_ext): def test_ignore_chartsheets(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1711,6 +1755,10 @@ def test_corrupt_files_closed(self, engine, read_ext): import xlrd errors = (BadZipFile, xlrd.biffh.XLRDError) + elif engine == "calamine": + from python_calamine import CalamineError + + errors = (CalamineError,) with tm.ensure_clean(f"corrupt{read_ext}") as file: Path(file).write_text("corrupt", encoding="utf-8") diff --git a/pyproject.toml b/pyproject.toml index 74d6aaee286a9..9e579036c128b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,7 +69,7 @@ computation = ['scipy>=1.8.1', 'xarray>=2022.03.0'] fss = ['fsspec>=2022.05.0'] aws = ['s3fs>=2022.05.0'] gcp = ['gcsfs>=2022.05.0', 'pandas-gbq>=0.17.5'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.10', 'pyxlsb>=1.0.9', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.10', 'python-calamine>=0.1.6', 'pyxlsb>=1.0.9', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -112,6 +112,7 @@ all = ['beautifulsoup4>=4.11.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.6', 'pyxlsb>=1.0.9', 'qtpy>=2.2.0', 'scipy>=1.8.1', diff --git a/requirements-dev.txt b/requirements-dev.txt index be02007a36333..ef3587b10d416 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -36,6 +36,7 @@ pyarrow>=7.0.0 pymysql>=1.0.2 pyreadstat>=1.1.5 tables>=3.7.0 +python-calamine>=0.1.6 pyxlsb>=1.0.9 s3fs>=2022.05.0 scipy>=1.8.1 diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index c70025f8f019d..1ede20f5cc0d8 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -44,6 +44,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 + - python-calamine>=0.1.6 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index b43815a982139..501ec4f061f17 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.6', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -103,6 +103,7 @@ all = ['beautifulsoup4>=5.9.3', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.6', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 503eb3c7c7734..14bedd1025bf8 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -44,6 +44,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 + - python-calamine>=0.1.6 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 From 0bdbc44babac09225bdde02b642252ce054723e3 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 13 Sep 2023 13:00:54 -0400 Subject: [PATCH 10/17] PERF: Index.difference (#55108) * PERF: Index.difference * whatsnew * remove is_monotonic check --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexes/base.py | 17 +++-------------- pandas/tests/indexes/datetimes/test_setops.py | 4 +++- pandas/tests/indexes/timedeltas/test_setops.py | 4 +++- 4 files changed, 10 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 249f08c7e387b..1b8864809975f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -178,6 +178,7 @@ Performance improvements - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) +- Performance improvement in :meth:`Index.difference` (:issue:`55108`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cd55997ad5f69..8756bb3f3c81b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3615,21 +3615,10 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex - - this = self.unique() - - indexer = this.get_indexer_for(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) - - label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - - the_diff: MultiIndex | ArrayLike - if isinstance(this, ABCMultiIndex): - the_diff = this.take(label_diff) - else: - the_diff = this._values.take(label_diff) + other = other.unique() + the_diff = self[other.get_indexer_for(self) == -1] + the_diff = the_diff if self.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) - return the_diff def _wrap_difference_result(self, other, result): diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 2e7b38abf4212..b56bad7f2e833 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -343,9 +343,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = date_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) - expected = DatetimeIndex(["20160920", "20160921"], freq=None) + expected = DatetimeIndex(["20160920", "20160921"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index cb6dce1e7ad80..6cdd6944e90ea 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -219,9 +219,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + expected = TimedeltaIndex(["0 days", "1 days"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) From 67b1e8b1f1fbf98c8e4e10473e6ac691d515593e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Sep 2023 07:03:48 -1000 Subject: [PATCH 11/17] DOC: Remove deprecated attributes in DatetimeIndex (#55093) --- pandas/core/indexes/datetimes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index dcb5f8caccd3e..400747cbf6b8d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -198,8 +198,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): timetz dayofyear day_of_year - weekofyear - week dayofweek day_of_week weekday From 310f8a8a31dd88a55641ce742c7d13a2a8b0e238 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Sep 2023 07:10:10 -1000 Subject: [PATCH 12/17] BUG: dt.tz with ArrowDtype returned string (#55072) --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/arrays/arrow/array.py | 3 ++- pandas/tests/extension/test_arrow.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 42af61be26355..6d5da7cdff3b3 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -35,6 +35,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug for :class:`ArrowDtype` raising ``NotImplementedError`` for fixed-size list (:issue:`55000`) - Fixed bug in :meth:`DataFrame.stack` with ``future_stack=True`` and columns a non-:class:`MultiIndex` consisting of tuples (:issue:`54948`) +- Fixed bug in :meth:`Series.dt.tz` with :class:`ArrowDtype` where a string was returned instead of a ``tzinfo`` object (:issue:`55003`) - Fixed bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` showing unnecessary ``FutureWarning`` (:issue:`54981`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 83ed54c42a23c..2b2e0c843564f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,6 +18,7 @@ from pandas._libs.tslibs import ( Timedelta, Timestamp, + timezones, ) from pandas.compat import ( pa_version_under7p0, @@ -2425,7 +2426,7 @@ def _dt_time(self): @property def _dt_tz(self): - return self.dtype.pyarrow_dtype.tz + return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz) @property def _dt_unit(self): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 2e98eea3cac8a..8968b9a7f25fe 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -31,6 +31,7 @@ import pytest from pandas._libs import lib +from pandas._libs.tslibs import timezones from pandas.compat import ( PY311, is_ci_environment, @@ -2432,7 +2433,7 @@ def test_dt_tz(tz): dtype=ArrowDtype(pa.timestamp("ns", tz=tz)), ) result = ser.dt.tz - assert result == tz + assert result == timezones.maybe_get_tz(tz) def test_dt_isocalendar(): From 4e28925751491581a8bf92531714204f2a68dcde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20=C5=A0=C3=ADcho?= Date: Wed, 13 Sep 2023 19:14:59 +0200 Subject: [PATCH 13/17] BUG: This fixes #55009 (`raw=True` caused `apply` method of `DataFrame` to ignore passed arguments) (#55089) * fixes #55009 * update documentation * write documentation * add test * change formatting * cite DataDrame directly in docs Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/apply.py | 6 +++++- pandas/tests/apply/test_frame_apply.py | 5 +++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 1b8864809975f..117df65f983af 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -189,6 +189,7 @@ Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) Categorical ^^^^^^^^^^^ diff --git a/pandas/core/apply.py b/pandas/core/apply.py index cc594bc8efb34..9748d4fe66739 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -955,7 +955,11 @@ def wrapper(*args, **kwargs): result = np.squeeze(result) else: result = np.apply_along_axis( - wrap_function(self.func), self.axis, self.values + wrap_function(self.func), + self.axis, + self.values, + *self.args, + **self.kwargs, ) # TODO: mixed type case diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3f2accc23e2d6..227b72573f979 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -45,8 +45,9 @@ def test_apply(float_frame): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_args(float_frame, axis): - result = float_frame.apply(lambda x, y: x + y, axis, args=(1,)) +@pytest.mark.parametrize("raw", [True, False]) +def test_apply_args(float_frame, axis, raw): + result = float_frame.apply(lambda x, y: x + y, axis, args=(1,), raw=raw) expected = float_frame + 1 tm.assert_frame_equal(result, expected) From 13b132e7d154cee2b6daf3133a283d745fee4def Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 14 Sep 2023 02:05:08 +0600 Subject: [PATCH 14/17] BUG: boolean/string value in OdsWriter (#54994) (#54996) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/excel/_odswriter.py | 27 ++++++++++---- pandas/tests/io/excel/test_odswriter.py | 49 +++++++++++++++++++++++++ pandas/tests/io/excel/test_readers.py | 24 ++---------- 4 files changed, 72 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 117df65f983af..54e855f61905a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -250,6 +250,7 @@ I/O ^^^ - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) Period ^^^^^^ diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 74cbe90acdae8..bc7dca2d95b6b 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -192,7 +192,15 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: if isinstance(val, bool): value = str(val).lower() pvalue = str(val).upper() - if isinstance(val, datetime.datetime): + return ( + pvalue, + TableCell( + valuetype="boolean", + booleanvalue=value, + attributes=attributes, + ), + ) + elif isinstance(val, datetime.datetime): # Fast formatting value = val.isoformat() # Slow but locale-dependent @@ -210,17 +218,20 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), ) + elif isinstance(val, str): + return ( + pvalue, + TableCell( + valuetype="string", + stringvalue=value, + attributes=attributes, + ), + ) else: - class_to_cell_type = { - str: "string", - int: "float", - float: "float", - bool: "boolean", - } return ( pvalue, TableCell( - valuetype=class_to_cell_type[type(val)], + valuetype="float", value=value, attributes=attributes, ), diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 21d31ec8a7fb5..ecee58362f8a9 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -1,7 +1,12 @@ +from datetime import ( + date, + datetime, +) import re import pytest +import pandas as pd import pandas._testing as tm from pandas.io.excel import ExcelWriter @@ -47,3 +52,47 @@ def test_book_and_sheets_consistent(ext): table = odf.table.Table(name="test_name") writer.book.spreadsheet.addElement(table) assert writer.sheets == {"test_name": table} + + +@pytest.mark.parametrize( + ["value", "cell_value_type", "cell_value_attribute", "cell_value"], + argvalues=[ + (True, "boolean", "boolean-value", "true"), + ("test string", "string", "string-value", "test string"), + (1, "float", "value", "1"), + (1.5, "float", "value", "1.5"), + ( + datetime(2010, 10, 10, 10, 10, 10), + "date", + "date-value", + "2010-10-10T10:10:10", + ), + (date(2010, 10, 10), "date", "date-value", "2010-10-10"), + ], +) +def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell_value): + # GH#54994 ODS: cell attributes should follow specification + # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13 + from odf.namespaces import OFFICENS + from odf.table import ( + TableCell, + TableRow, + ) + + table_cell_name = TableCell().qname + + with tm.ensure_clean(ext) as f: + pd.DataFrame([[value]]).to_excel(f, header=False, index=False) + + with pd.ExcelFile(f) as wb: + sheet = wb._reader.get_sheet_by_index(0) + sheet_rows = sheet.getElementsByType(TableRow) + sheet_cells = [ + x + for x in sheet_rows[0].childNodes + if hasattr(x, "qname") and x.qname == table_cell_name + ] + + cell = sheet_cells[0] + assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type + assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index de444019e7b4c..8dd9f96a05a90 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -578,17 +578,11 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) - def test_dtype_backend(self, request, engine, read_ext, dtype_backend): + def test_dtype_backend(self, read_ext, dtype_backend): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - # GH 54994 - if engine == "calamine" and read_ext == ".ods": - request.node.add_marker( - pytest.mark.xfail(reason="OdsWriter produces broken file") - ) - df = DataFrame( { "a": Series([1, 3], dtype="Int64"), @@ -629,17 +623,11 @@ def test_dtype_backend(self, request, engine, read_ext, dtype_backend): expected = df tm.assert_frame_equal(result, expected) - def test_dtype_backend_and_dtype(self, request, engine, read_ext): + def test_dtype_backend_and_dtype(self, read_ext): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - # GH 54994 - if engine == "calamine" and read_ext == ".ods": - request.node.add_marker( - pytest.mark.xfail(reason="OdsWriter produces broken file") - ) - df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, sheet_name="test", index=False) @@ -651,17 +639,11 @@ def test_dtype_backend_and_dtype(self, request, engine, read_ext): ) tm.assert_frame_equal(result, df) - def test_dtype_backend_string(self, request, engine, read_ext, string_storage): + def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") - # GH 54994 - if engine == "calamine" and read_ext == ".ods": - request.node.add_marker( - pytest.mark.xfail(reason="OdsWriter produces broken file") - ) - pa = pytest.importorskip("pyarrow") with pd.option_context("mode.string_storage", string_storage): From 51c2300210533a27fbd8bb58f93c2f382bbbdc40 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 13 Sep 2023 19:36:04 -0400 Subject: [PATCH 15/17] Use pandasSQL transactions in sql test suite to avoid engine deadlocks (#55129) pandasSQL use transactions in test suite --- pandas/tests/io/test_sql.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index bbdb22955297e..1abe0ad55a864 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1141,18 +1141,21 @@ def load_types_data(self, types_data): def _read_sql_iris_parameter(self, sql_strings): query = sql_strings["read_parameters"][self.flavor] params = ("Iris-setosa", 5.1) - iris_frame = self.pandasSQL.read_query(query, params=params) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=params) check_iris_frame(iris_frame) def _read_sql_iris_named_parameter(self, sql_strings): query = sql_strings["read_named_parameters"][self.flavor] params = {"name": "Iris-setosa", "length": 5.1} - iris_frame = self.pandasSQL.read_query(query, params=params) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=params) check_iris_frame(iris_frame) def _read_sql_iris_no_parameter_with_percent(self, sql_strings): query = sql_strings["read_no_parameters_with_percent"][self.flavor] - iris_frame = self.pandasSQL.read_query(query, params=None) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=None) check_iris_frame(iris_frame) def _to_sql_empty(self, test_frame1): @@ -1182,7 +1185,8 @@ def _to_sql_with_sql_engine(self, test_frame1, engine="auto", **engine_kwargs): def _roundtrip(self, test_frame1): self.drop_table("test_frame_roundtrip", self.conn) assert self.pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 - result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + with self.pandasSQL.run_transaction(): + result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") result.set_index("level_0", inplace=True) # result.index.astype(int) @@ -1232,13 +1236,14 @@ class DummyException(Exception): except DummyException: # ignore raised exception pass - res = self.pandasSQL.read_query("SELECT * FROM test_trans") + with self.pandasSQL.run_transaction(): + res = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res) == 0 # Make sure when transaction is committed, rows do get inserted with self.pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") + res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res2) == 1 From 81fb7e76073ffe6adb875f15cdcfbac52c15b339 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 13 Sep 2023 16:37:46 -0700 Subject: [PATCH 16/17] DEPS: remove duplicated dependency in requirement-dev.txt (#55101) * Test installing dev dependencies with pip * fix typo * remove 3.12, list deps * remove pip ci test --- environment.yml | 2 +- requirements-dev.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 1eb0b4cc2c7a6..8deae839f5408 100644 --- a/environment.yml +++ b/environment.yml @@ -106,7 +106,7 @@ dependencies: - ipykernel # web - - jinja2 # in optional dependencies, but documented here as needed + # - jinja2 # already listed in optional dependencies, but documented here for reference - markdown - feedparser - pyyaml diff --git a/requirements-dev.txt b/requirements-dev.txt index ef3587b10d416..01e0701bc39a7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -77,7 +77,6 @@ ipywidgets nbformat notebook>=6.0.3 ipykernel -jinja2 markdown feedparser pyyaml From f00efd0344bd4e22cc867e76c776cb88669e6cde Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 13 Sep 2023 19:39:07 -0400 Subject: [PATCH 17/17] Assorted UBSAN cleanups (#55112) * first round of fixes * fix up includes * updates * dedup logic * move comment --- .../src/vendored/ujson/lib/ultrajsonenc.c | 8 ++- pandas/_libs/tslibs/np_datetime.pyx | 49 +++++++++++++------ 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index e3e710ce1b876..942bd0b518144 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -44,6 +44,7 @@ Numeric decoder derived from TCL library #include #include #include +#include #include #include #include @@ -763,7 +764,12 @@ void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { char *wstr; - JSUINT64 uvalue = (value < 0) ? -value : value; + JSUINT64 uvalue; + if (value == INT64_MIN) { + uvalue = INT64_MAX + UINT64_C(1); + } else { + uvalue = (value < 0) ? -value : value; + } wstr = enc->offset; // Conversion. Number is reversed. diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 7b2ee68c73ad2..c3ee68e14a8d4 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,4 +1,3 @@ -cimport cython from cpython.datetime cimport ( PyDateTime_CheckExact, PyDateTime_DATE_GET_HOUR, @@ -18,6 +17,7 @@ from cpython.object cimport ( Py_LT, Py_NE, ) +from libc.stdint cimport INT64_MAX import_datetime() PandasDateTime_IMPORT @@ -545,7 +545,6 @@ cdef ndarray astype_round_check( return iresult -@cython.overflowcheck(True) cdef int64_t get_conversion_factor( NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit @@ -553,6 +552,7 @@ cdef int64_t get_conversion_factor( """ Find the factor by which we need to multiply to convert from from_unit to to_unit. """ + cdef int64_t value, overflow_limit, factor if ( from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC @@ -565,28 +565,44 @@ cdef int64_t get_conversion_factor( return 1 if from_unit == NPY_DATETIMEUNIT.NPY_FR_W: - return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + factor = 7 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D: - return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + factor = 24 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + factor = 1000 else: raise ValueError("Converting from M or Y units is not supported.") + overflow_limit = INT64_MAX // factor + if value > overflow_limit or value < -overflow_limit: + raise OverflowError("result would overflow") + + return factor * value + cdef int64_t convert_reso( int64_t value, @@ -595,7 +611,7 @@ cdef int64_t convert_reso( bint round_ok, ) except? -1: cdef: - int64_t res_value, mult, div, mod + int64_t res_value, mult, div, mod, overflow_limit if from_reso == to_reso: return value @@ -624,9 +640,12 @@ cdef int64_t convert_reso( else: # e.g. ns -> us, risk of overflow, but no risk of lossy rounding mult = get_conversion_factor(from_reso, to_reso) - with cython.overflowcheck(True): + overflow_limit = INT64_MAX // mult + if value > overflow_limit or value < -overflow_limit: # Note: caller is responsible for re-raising as OutOfBoundsTimedelta - res_value = value * mult + raise OverflowError("result would overflow") + + res_value = value * mult return res_value