diff --git a/.circleci/config.yml b/.circleci/config.yml
index b6a5a00429d9a..1c4f33925c999 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -56,7 +56,7 @@ jobs:
/opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
- python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+ python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
export PANDAS_CI=1
diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
index 460ae2f8594c0..6eac6fcf84f51 100644
--- a/.github/actions/build_pandas/action.yml
+++ b/.github/actions/build_pandas/action.yml
@@ -22,6 +22,13 @@ runs:
fi
shell: bash -el {0}
+ - name: Uninstall nomkl
+ run: |
+ if conda list nomkl | grep nomkl 1>/dev/null; then
+ conda remove nomkl -y
+ fi
+ shell: bash -el {0}
+
- name: Build Pandas
run: |
if [[ ${{ inputs.editable }} == "true" ]]; then
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 166c06acccc49..68b7573f01501 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -236,7 +236,7 @@ jobs:
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
- python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+ python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
export PANDAS_CI=1
@@ -274,7 +274,7 @@ jobs:
/opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
- python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+ python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
@@ -295,7 +295,7 @@ jobs:
# In general, this will remain frozen(present, but not running) until:
# - The next unreleased Python version has released beta 1
# - This version should be available on GitHub Actions.
- # - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil)
+ # - Our required build/runtime dependencies (numpy, Cython, python-dateutil)
# support that unreleased Python version.
# To unfreeze, comment out the ``if: false`` condition, and make sure you update
# the name of the workflow and Python version in actions/setup-python ``python-version:``
@@ -348,7 +348,7 @@ jobs:
python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
python -m pip install versioneer[toml]
- python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
+ python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror"
python -m pip list
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b81b9ba070a44..f6717dd503c9b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,6 +23,7 @@ repos:
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
+ exclude: ^pandas/tests/frame/test_query_eval.py
- id: ruff
# TODO: remove autofix-only rules when they are checked by ruff
name: ruff-selected-autofixes
@@ -31,7 +32,7 @@ repos:
exclude: ^pandas/tests
args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
- id: ruff-format
- exclude: ^scripts
+ exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
- repo: https://github.com/jendrikseipp/vulture
rev: 'v2.11'
hooks:
@@ -85,6 +86,7 @@ repos:
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- id: rst-inline-touching-normal
+ exclude: ^pandas/tests/frame/test_query_eval.py
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- repo: https://github.com/sphinx-contrib/sphinx-lint
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 540bd59cd5924..f1aeda9cbdc9d 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -70,20 +70,15 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
--format=actions \
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
- -i "pandas.MultiIndex.names SA01" \
-i "pandas.MultiIndex.reorder_levels RT03,SA01" \
- -i "pandas.MultiIndex.sortlevel PR07,SA01" \
-i "pandas.MultiIndex.to_frame RT03" \
-i "pandas.NA SA01" \
-i "pandas.NaT SA01" \
- -i "pandas.Period.asfreq SA01" \
-i "pandas.Period.freq GL08" \
-i "pandas.Period.freqstr SA01" \
- -i "pandas.Period.month SA01" \
-i "pandas.Period.ordinal GL08" \
-i "pandas.Period.strftime PR01,SA01" \
-i "pandas.Period.to_timestamp SA01" \
- -i "pandas.Period.year SA01" \
-i "pandas.PeriodDtype SA01" \
-i "pandas.PeriodDtype.freq SA01" \
-i "pandas.PeriodIndex.day SA01" \
@@ -158,26 +153,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.sparse.sp_values SA01" \
-i "pandas.Series.sparse.to_coo PR07,RT03,SA01" \
-i "pandas.Series.std PR01,RT03,SA01" \
- -i "pandas.Series.str.capitalize RT03" \
- -i "pandas.Series.str.casefold RT03" \
- -i "pandas.Series.str.center RT03,SA01" \
- -i "pandas.Series.str.decode PR07,RT03,SA01" \
- -i "pandas.Series.str.encode PR07,RT03,SA01" \
- -i "pandas.Series.str.ljust RT03,SA01" \
- -i "pandas.Series.str.lower RT03" \
- -i "pandas.Series.str.lstrip RT03" \
-i "pandas.Series.str.match RT03" \
-i "pandas.Series.str.normalize RT03,SA01" \
- -i "pandas.Series.str.partition RT03" \
-i "pandas.Series.str.repeat SA01" \
-i "pandas.Series.str.replace SA01" \
- -i "pandas.Series.str.rjust RT03,SA01" \
- -i "pandas.Series.str.rpartition RT03" \
- -i "pandas.Series.str.rstrip RT03" \
- -i "pandas.Series.str.strip RT03" \
- -i "pandas.Series.str.swapcase RT03" \
- -i "pandas.Series.str.title RT03" \
- -i "pandas.Series.str.upper RT03" \
-i "pandas.Series.str.wrap RT03,SA01" \
-i "pandas.Series.str.zfill RT03" \
-i "pandas.Series.struct.dtypes SA01" \
@@ -201,43 +180,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.TimedeltaIndex.nanoseconds SA01" \
-i "pandas.TimedeltaIndex.seconds SA01" \
-i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
- -i "pandas.Timestamp.combine PR01,SA01" \
- -i "pandas.Timestamp.ctime SA01" \
- -i "pandas.Timestamp.date SA01" \
-i "pandas.Timestamp.day GL08" \
-i "pandas.Timestamp.fold GL08" \
- -i "pandas.Timestamp.fromordinal SA01" \
- -i "pandas.Timestamp.fromtimestamp PR01,SA01" \
-i "pandas.Timestamp.hour GL08" \
-i "pandas.Timestamp.max PR02" \
-i "pandas.Timestamp.microsecond GL08" \
-i "pandas.Timestamp.min PR02" \
-i "pandas.Timestamp.minute GL08" \
-i "pandas.Timestamp.month GL08" \
- -i "pandas.Timestamp.month_name SA01" \
-i "pandas.Timestamp.nanosecond GL08" \
- -i "pandas.Timestamp.normalize SA01" \
- -i "pandas.Timestamp.quarter SA01" \
- -i "pandas.Timestamp.replace PR07,SA01" \
-i "pandas.Timestamp.resolution PR02" \
-i "pandas.Timestamp.second GL08" \
- -i "pandas.Timestamp.strptime PR01,SA01" \
- -i "pandas.Timestamp.timestamp SA01" \
- -i "pandas.Timestamp.timetuple SA01" \
- -i "pandas.Timestamp.timetz SA01" \
- -i "pandas.Timestamp.to_datetime64 SA01" \
- -i "pandas.Timestamp.to_julian_date SA01" \
- -i "pandas.Timestamp.to_numpy PR01" \
- -i "pandas.Timestamp.to_period PR01,SA01" \
- -i "pandas.Timestamp.today SA01" \
- -i "pandas.Timestamp.toordinal SA01" \
- -i "pandas.Timestamp.tz_localize SA01" \
-i "pandas.Timestamp.tzinfo GL08" \
- -i "pandas.Timestamp.tzname SA01" \
- -i "pandas.Timestamp.unit SA01" \
- -i "pandas.Timestamp.utcfromtimestamp PR01,SA01" \
- -i "pandas.Timestamp.utcoffset SA01" \
- -i "pandas.Timestamp.utctimetuple SA01" \
-i "pandas.Timestamp.value GL08" \
-i "pandas.Timestamp.year GL08" \
-i "pandas.api.extensions.ExtensionArray._pad_or_backfill PR01,RT03,SA01" \
@@ -251,17 +205,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.api.extensions.ExtensionArray.insert PR07,RT03,SA01" \
-i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
-i "pandas.api.extensions.ExtensionArray.isin PR07,RT03,SA01" \
- -i "pandas.api.extensions.ExtensionArray.isna SA01" \
- -i "pandas.api.extensions.ExtensionArray.nbytes SA01" \
- -i "pandas.api.extensions.ExtensionArray.ndim SA01" \
- -i "pandas.api.extensions.ExtensionArray.ravel RT03,SA01" \
- -i "pandas.api.extensions.ExtensionArray.take RT03" \
-i "pandas.api.extensions.ExtensionArray.tolist RT03,SA01" \
-i "pandas.api.extensions.ExtensionArray.unique RT03,SA01" \
-i "pandas.api.extensions.ExtensionArray.view SA01" \
-i "pandas.api.interchange.from_dataframe RT03,SA01" \
-i "pandas.api.types.is_bool PR01,SA01" \
- -i "pandas.api.types.is_bool_dtype SA01" \
-i "pandas.api.types.is_categorical_dtype SA01" \
-i "pandas.api.types.is_complex PR01,SA01" \
-i "pandas.api.types.is_complex_dtype SA01" \
@@ -423,156 +371,103 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.set_eng_float_format RT03,SA01" \
-i "pandas.testing.assert_extension_array_equal SA01" \
-i "pandas.tseries.offsets.BDay PR02,SA01" \
- -i "pandas.tseries.offsets.BQuarterBegin PR02" \
- -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \
-i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.n GL08" \
- -i "pandas.tseries.offsets.BQuarterBegin.nanos GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.rule_code GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.startingMonth GL08" \
- -i "pandas.tseries.offsets.BQuarterEnd.freqstr SA01" \
-i "pandas.tseries.offsets.BQuarterEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.BQuarterEnd.n GL08" \
- -i "pandas.tseries.offsets.BQuarterEnd.nanos GL08" \
-i "pandas.tseries.offsets.BQuarterEnd.normalize GL08" \
-i "pandas.tseries.offsets.BQuarterEnd.rule_code GL08" \
-i "pandas.tseries.offsets.BQuarterEnd.startingMonth GL08" \
- -i "pandas.tseries.offsets.BYearBegin.freqstr SA01" \
-i "pandas.tseries.offsets.BYearBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.BYearBegin.month GL08" \
-i "pandas.tseries.offsets.BYearBegin.n GL08" \
- -i "pandas.tseries.offsets.BYearBegin.nanos GL08" \
-i "pandas.tseries.offsets.BYearBegin.normalize GL08" \
- -i "pandas.tseries.offsets.BYearBegin.rule_code GL08" \
- -i "pandas.tseries.offsets.BYearEnd PR02" \
- -i "pandas.tseries.offsets.BYearEnd.freqstr SA01" \
-i "pandas.tseries.offsets.BYearEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.BYearEnd.month GL08" \
-i "pandas.tseries.offsets.BYearEnd.n GL08" \
- -i "pandas.tseries.offsets.BYearEnd.nanos GL08" \
-i "pandas.tseries.offsets.BYearEnd.normalize GL08" \
- -i "pandas.tseries.offsets.BYearEnd.rule_code GL08" \
-i "pandas.tseries.offsets.BusinessDay PR02,SA01" \
-i "pandas.tseries.offsets.BusinessDay.calendar GL08" \
- -i "pandas.tseries.offsets.BusinessDay.freqstr SA01" \
-i "pandas.tseries.offsets.BusinessDay.holidays GL08" \
-i "pandas.tseries.offsets.BusinessDay.is_on_offset GL08" \
-i "pandas.tseries.offsets.BusinessDay.n GL08" \
- -i "pandas.tseries.offsets.BusinessDay.nanos GL08" \
-i "pandas.tseries.offsets.BusinessDay.normalize GL08" \
- -i "pandas.tseries.offsets.BusinessDay.rule_code GL08" \
-i "pandas.tseries.offsets.BusinessDay.weekmask GL08" \
-i "pandas.tseries.offsets.BusinessHour PR02,SA01" \
-i "pandas.tseries.offsets.BusinessHour.calendar GL08" \
-i "pandas.tseries.offsets.BusinessHour.end GL08" \
- -i "pandas.tseries.offsets.BusinessHour.freqstr SA01" \
-i "pandas.tseries.offsets.BusinessHour.holidays GL08" \
-i "pandas.tseries.offsets.BusinessHour.is_on_offset GL08" \
-i "pandas.tseries.offsets.BusinessHour.n GL08" \
- -i "pandas.tseries.offsets.BusinessHour.nanos GL08" \
-i "pandas.tseries.offsets.BusinessHour.normalize GL08" \
- -i "pandas.tseries.offsets.BusinessHour.rule_code GL08" \
-i "pandas.tseries.offsets.BusinessHour.start GL08" \
-i "pandas.tseries.offsets.BusinessHour.weekmask GL08" \
- -i "pandas.tseries.offsets.BusinessMonthBegin.freqstr SA01" \
-i "pandas.tseries.offsets.BusinessMonthBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.BusinessMonthBegin.n GL08" \
- -i "pandas.tseries.offsets.BusinessMonthBegin.nanos GL08" \
-i "pandas.tseries.offsets.BusinessMonthBegin.normalize GL08" \
- -i "pandas.tseries.offsets.BusinessMonthBegin.rule_code GL08" \
- -i "pandas.tseries.offsets.BusinessMonthEnd.freqstr SA01" \
-i "pandas.tseries.offsets.BusinessMonthEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.BusinessMonthEnd.n GL08" \
- -i "pandas.tseries.offsets.BusinessMonthEnd.nanos GL08" \
-i "pandas.tseries.offsets.BusinessMonthEnd.normalize GL08" \
- -i "pandas.tseries.offsets.BusinessMonthEnd.rule_code GL08" \
-i "pandas.tseries.offsets.CBMonthBegin PR02" \
-i "pandas.tseries.offsets.CBMonthEnd PR02" \
-i "pandas.tseries.offsets.CDay PR02,SA01" \
-i "pandas.tseries.offsets.CustomBusinessDay PR02,SA01" \
-i "pandas.tseries.offsets.CustomBusinessDay.calendar GL08" \
- -i "pandas.tseries.offsets.CustomBusinessDay.freqstr SA01" \
-i "pandas.tseries.offsets.CustomBusinessDay.holidays GL08" \
-i "pandas.tseries.offsets.CustomBusinessDay.is_on_offset GL08" \
-i "pandas.tseries.offsets.CustomBusinessDay.n GL08" \
- -i "pandas.tseries.offsets.CustomBusinessDay.nanos GL08" \
-i "pandas.tseries.offsets.CustomBusinessDay.normalize GL08" \
- -i "pandas.tseries.offsets.CustomBusinessDay.rule_code GL08" \
-i "pandas.tseries.offsets.CustomBusinessDay.weekmask GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour PR02,SA01" \
-i "pandas.tseries.offsets.CustomBusinessHour.calendar GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.end GL08" \
- -i "pandas.tseries.offsets.CustomBusinessHour.freqstr SA01" \
-i "pandas.tseries.offsets.CustomBusinessHour.holidays GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.is_on_offset GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.n GL08" \
- -i "pandas.tseries.offsets.CustomBusinessHour.nanos GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.normalize GL08" \
- -i "pandas.tseries.offsets.CustomBusinessHour.rule_code GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.start GL08" \
-i "pandas.tseries.offsets.CustomBusinessHour.weekmask GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthBegin.freqstr SA01" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthBegin.nanos GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthBegin.rule_code GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthBegin.weekmask GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthEnd.freqstr SA01" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthEnd.nanos GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \
- -i "pandas.tseries.offsets.CustomBusinessMonthEnd.rule_code GL08" \
-i "pandas.tseries.offsets.CustomBusinessMonthEnd.weekmask GL08" \
- -i "pandas.tseries.offsets.DateOffset PR02" \
- -i "pandas.tseries.offsets.DateOffset.freqstr SA01" \
-i "pandas.tseries.offsets.DateOffset.is_on_offset GL08" \
-i "pandas.tseries.offsets.DateOffset.n GL08" \
- -i "pandas.tseries.offsets.DateOffset.nanos GL08" \
-i "pandas.tseries.offsets.DateOffset.normalize GL08" \
- -i "pandas.tseries.offsets.DateOffset.rule_code GL08" \
- -i "pandas.tseries.offsets.Day.freqstr SA01" \
-i "pandas.tseries.offsets.Day.is_on_offset GL08" \
-i "pandas.tseries.offsets.Day.n GL08" \
- -i "pandas.tseries.offsets.Day.nanos SA01" \
-i "pandas.tseries.offsets.Day.normalize GL08" \
- -i "pandas.tseries.offsets.Day.rule_code GL08" \
- -i "pandas.tseries.offsets.Easter PR02" \
- -i "pandas.tseries.offsets.Easter.freqstr SA01" \
-i "pandas.tseries.offsets.Easter.is_on_offset GL08" \
-i "pandas.tseries.offsets.Easter.n GL08" \
- -i "pandas.tseries.offsets.Easter.nanos GL08" \
-i "pandas.tseries.offsets.Easter.normalize GL08" \
- -i "pandas.tseries.offsets.Easter.rule_code GL08" \
- -i "pandas.tseries.offsets.FY5253 PR02" \
- -i "pandas.tseries.offsets.FY5253.freqstr SA01" \
-i "pandas.tseries.offsets.FY5253.get_rule_code_suffix GL08" \
-i "pandas.tseries.offsets.FY5253.get_year_end GL08" \
-i "pandas.tseries.offsets.FY5253.is_on_offset GL08" \
-i "pandas.tseries.offsets.FY5253.n GL08" \
- -i "pandas.tseries.offsets.FY5253.nanos GL08" \
-i "pandas.tseries.offsets.FY5253.normalize GL08" \
-i "pandas.tseries.offsets.FY5253.rule_code GL08" \
-i "pandas.tseries.offsets.FY5253.startingMonth GL08" \
-i "pandas.tseries.offsets.FY5253.variation GL08" \
-i "pandas.tseries.offsets.FY5253.weekday GL08" \
- -i "pandas.tseries.offsets.FY5253Quarter PR02" \
- -i "pandas.tseries.offsets.FY5253Quarter.freqstr SA01" \
-i "pandas.tseries.offsets.FY5253Quarter.get_rule_code_suffix GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.get_weeks GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.is_on_offset GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.n GL08" \
- -i "pandas.tseries.offsets.FY5253Quarter.nanos GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.normalize GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.qtr_with_extra_week GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.rule_code GL08" \
@@ -580,139 +475,80 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.tseries.offsets.FY5253Quarter.variation GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \
-i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \
- -i "pandas.tseries.offsets.Hour PR02" \
- -i "pandas.tseries.offsets.Hour.freqstr SA01" \
-i "pandas.tseries.offsets.Hour.is_on_offset GL08" \
-i "pandas.tseries.offsets.Hour.n GL08" \
- -i "pandas.tseries.offsets.Hour.nanos SA01" \
-i "pandas.tseries.offsets.Hour.normalize GL08" \
- -i "pandas.tseries.offsets.Hour.rule_code GL08" \
- -i "pandas.tseries.offsets.LastWeekOfMonth PR02,SA01" \
- -i "pandas.tseries.offsets.LastWeekOfMonth.freqstr SA01" \
+ -i "pandas.tseries.offsets.LastWeekOfMonth SA01" \
-i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \
-i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \
- -i "pandas.tseries.offsets.LastWeekOfMonth.nanos GL08" \
-i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \
- -i "pandas.tseries.offsets.LastWeekOfMonth.rule_code GL08" \
-i "pandas.tseries.offsets.LastWeekOfMonth.week GL08" \
-i "pandas.tseries.offsets.LastWeekOfMonth.weekday GL08" \
- -i "pandas.tseries.offsets.Micro PR02" \
- -i "pandas.tseries.offsets.Micro.freqstr SA01" \
-i "pandas.tseries.offsets.Micro.is_on_offset GL08" \
-i "pandas.tseries.offsets.Micro.n GL08" \
- -i "pandas.tseries.offsets.Micro.nanos SA01" \
-i "pandas.tseries.offsets.Micro.normalize GL08" \
- -i "pandas.tseries.offsets.Micro.rule_code GL08" \
- -i "pandas.tseries.offsets.Milli PR02" \
- -i "pandas.tseries.offsets.Milli.freqstr SA01" \
-i "pandas.tseries.offsets.Milli.is_on_offset GL08" \
-i "pandas.tseries.offsets.Milli.n GL08" \
- -i "pandas.tseries.offsets.Milli.nanos SA01" \
-i "pandas.tseries.offsets.Milli.normalize GL08" \
- -i "pandas.tseries.offsets.Milli.rule_code GL08" \
- -i "pandas.tseries.offsets.Minute PR02" \
- -i "pandas.tseries.offsets.Minute.freqstr SA01" \
-i "pandas.tseries.offsets.Minute.is_on_offset GL08" \
-i "pandas.tseries.offsets.Minute.n GL08" \
- -i "pandas.tseries.offsets.Minute.nanos SA01" \
-i "pandas.tseries.offsets.Minute.normalize GL08" \
- -i "pandas.tseries.offsets.Minute.rule_code GL08" \
- -i "pandas.tseries.offsets.MonthBegin PR02" \
- -i "pandas.tseries.offsets.MonthBegin.freqstr SA01" \
-i "pandas.tseries.offsets.MonthBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.MonthBegin.n GL08" \
- -i "pandas.tseries.offsets.MonthBegin.nanos GL08" \
-i "pandas.tseries.offsets.MonthBegin.normalize GL08" \
- -i "pandas.tseries.offsets.MonthBegin.rule_code GL08" \
- -i "pandas.tseries.offsets.MonthEnd.freqstr SA01" \
-i "pandas.tseries.offsets.MonthEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.MonthEnd.n GL08" \
- -i "pandas.tseries.offsets.MonthEnd.nanos GL08" \
-i "pandas.tseries.offsets.MonthEnd.normalize GL08" \
- -i "pandas.tseries.offsets.MonthEnd.rule_code GL08" \
- -i "pandas.tseries.offsets.Nano PR02" \
- -i "pandas.tseries.offsets.Nano.freqstr SA01" \
-i "pandas.tseries.offsets.Nano.is_on_offset GL08" \
- -i "pandas.tseries.offsets.Nano.n GL08" \
- -i "pandas.tseries.offsets.Nano.nanos SA01" \
-i "pandas.tseries.offsets.Nano.normalize GL08" \
- -i "pandas.tseries.offsets.Nano.rule_code GL08" \
- -i "pandas.tseries.offsets.QuarterBegin PR02" \
- -i "pandas.tseries.offsets.QuarterBegin.freqstr SA01" \
+ -i "pandas.tseries.offsets.Nano.n GL08" \
-i "pandas.tseries.offsets.QuarterBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.QuarterBegin.n GL08" \
- -i "pandas.tseries.offsets.QuarterBegin.nanos GL08" \
-i "pandas.tseries.offsets.QuarterBegin.normalize GL08" \
-i "pandas.tseries.offsets.QuarterBegin.rule_code GL08" \
-i "pandas.tseries.offsets.QuarterBegin.startingMonth GL08" \
- -i "pandas.tseries.offsets.QuarterEnd.freqstr SA01" \
-i "pandas.tseries.offsets.QuarterEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.QuarterEnd.n GL08" \
- -i "pandas.tseries.offsets.QuarterEnd.nanos GL08" \
-i "pandas.tseries.offsets.QuarterEnd.normalize GL08" \
-i "pandas.tseries.offsets.QuarterEnd.rule_code GL08" \
-i "pandas.tseries.offsets.QuarterEnd.startingMonth GL08" \
- -i "pandas.tseries.offsets.Second PR02" \
- -i "pandas.tseries.offsets.Second.freqstr SA01" \
-i "pandas.tseries.offsets.Second.is_on_offset GL08" \
-i "pandas.tseries.offsets.Second.n GL08" \
- -i "pandas.tseries.offsets.Second.nanos SA01" \
-i "pandas.tseries.offsets.Second.normalize GL08" \
- -i "pandas.tseries.offsets.Second.rule_code GL08" \
- -i "pandas.tseries.offsets.SemiMonthBegin PR02,SA01" \
+ -i "pandas.tseries.offsets.SemiMonthBegin SA01" \
-i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \
- -i "pandas.tseries.offsets.SemiMonthBegin.freqstr SA01" \
-i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \
- -i "pandas.tseries.offsets.SemiMonthBegin.nanos GL08" \
-i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \
-i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \
-i "pandas.tseries.offsets.SemiMonthEnd SA01" \
-i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \
- -i "pandas.tseries.offsets.SemiMonthEnd.freqstr SA01" \
-i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \
- -i "pandas.tseries.offsets.SemiMonthEnd.nanos GL08" \
-i "pandas.tseries.offsets.SemiMonthEnd.normalize GL08" \
-i "pandas.tseries.offsets.SemiMonthEnd.rule_code GL08" \
-i "pandas.tseries.offsets.Tick GL08" \
- -i "pandas.tseries.offsets.Tick.freqstr SA01" \
-i "pandas.tseries.offsets.Tick.is_on_offset GL08" \
-i "pandas.tseries.offsets.Tick.n GL08" \
- -i "pandas.tseries.offsets.Tick.nanos SA01" \
-i "pandas.tseries.offsets.Tick.normalize GL08" \
- -i "pandas.tseries.offsets.Tick.rule_code GL08" \
- -i "pandas.tseries.offsets.Week PR02" \
- -i "pandas.tseries.offsets.Week.freqstr SA01" \
-i "pandas.tseries.offsets.Week.is_on_offset GL08" \
-i "pandas.tseries.offsets.Week.n GL08" \
- -i "pandas.tseries.offsets.Week.nanos GL08" \
-i "pandas.tseries.offsets.Week.normalize GL08" \
- -i "pandas.tseries.offsets.Week.rule_code GL08" \
-i "pandas.tseries.offsets.Week.weekday GL08" \
- -i "pandas.tseries.offsets.WeekOfMonth PR02,SA01" \
- -i "pandas.tseries.offsets.WeekOfMonth.freqstr SA01" \
+ -i "pandas.tseries.offsets.WeekOfMonth SA01" \
-i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \
-i "pandas.tseries.offsets.WeekOfMonth.n GL08" \
- -i "pandas.tseries.offsets.WeekOfMonth.nanos GL08" \
-i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \
- -i "pandas.tseries.offsets.WeekOfMonth.rule_code GL08" \
-i "pandas.tseries.offsets.WeekOfMonth.week GL08" \
-i "pandas.tseries.offsets.WeekOfMonth.weekday GL08" \
- -i "pandas.tseries.offsets.YearBegin.freqstr SA01" \
-i "pandas.tseries.offsets.YearBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.YearBegin.month GL08" \
-i "pandas.tseries.offsets.YearBegin.n GL08" \
- -i "pandas.tseries.offsets.YearBegin.nanos GL08" \
-i "pandas.tseries.offsets.YearBegin.normalize GL08" \
- -i "pandas.tseries.offsets.YearBegin.rule_code GL08" \
- -i "pandas.tseries.offsets.YearEnd.freqstr SA01" \
-i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \
-i "pandas.tseries.offsets.YearEnd.month GL08" \
-i "pandas.tseries.offsets.YearEnd.n GL08" \
- -i "pandas.tseries.offsets.YearEnd.nanos GL08" \
-i "pandas.tseries.offsets.YearEnd.normalize GL08" \
- -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \
-i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function
RET=$(($RET + $?)) ; echo $MSG "DONE"
diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index 0c46f476893dd..e670356c95637 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -23,7 +23,6 @@ dependencies:
# required dependencies
- python-dateutil=2.8.2
- numpy=1.23.5
- - pytz=2020.1
# optional dependencies
- beautifulsoup4=4.11.2
@@ -49,6 +48,7 @@ dependencies:
- pyreadstat=1.2.0
- pytables=3.8.0
- python-calamine=0.1.7
+ - pytz=2023.4
- pyxlsb=1.0.10
- s3fs=2022.11.0
- scipy=1.10.0
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 0af46752f5b3d..c33c0344e742f 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -21,7 +21,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -47,6 +46,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 1a842c7212c1f..8692b6e35ab2d 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -22,7 +22,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -48,6 +47,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml
index 748cfa861ec32..996ce5cd9ab94 100644
--- a/ci/deps/actions-311-numpydev.yaml
+++ b/ci/deps/actions-311-numpydev.yaml
@@ -18,7 +18,6 @@ dependencies:
# pandas dependencies
- python-dateutil
- - pytz
- pip
- pip:
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index 469fb1bfb9138..434f1d4f7fed2 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -19,7 +19,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy<2
- - pytz
- pip
- pip:
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index 75394e2c8e109..8e7d9aba7878d 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -21,7 +21,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -47,6 +46,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index d4b43ddef3601..6c97960a62d40 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -21,7 +21,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -47,6 +46,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml
index b0ae9f1e48473..c157d2e65c001 100644
--- a/ci/deps/actions-pypy-39.yaml
+++ b/ci/deps/actions-pypy-39.yaml
@@ -22,6 +22,5 @@ dependencies:
# required
- numpy
- python-dateutil
- - pytz
- pip:
- tzdata>=2022.7
diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml
index 18535d81e6985..c86534871b3d2 100644
--- a/ci/deps/circle-311-arm64.yaml
+++ b/ci/deps/circle-311-arm64.yaml
@@ -21,7 +21,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -47,6 +46,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/ci/meta.yaml b/ci/meta.yaml
index b76bef2f630b7..9d434991b12c1 100644
--- a/ci/meta.yaml
+++ b/ci/meta.yaml
@@ -37,7 +37,6 @@ requirements:
- numpy >=1.21.6 # [py<311]
- numpy >=1.23.2 # [py>=311]
- python-dateutil >=2.8.2
- - pytz >=2020.1
- python-tzdata >=2022.7
test:
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index 28129440b86d7..277f407ae4418 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -762,8 +762,7 @@ install pandas) by typing::
your installation is probably fine and you can start contributing!
Often it is worth running only a subset of tests first around your changes before running the
-entire suite (tip: you can use the `pandas-coverage app `_)
-to find out which tests hit the lines of code you've modified, and then run only those).
+entire suite.
The easiest way to do this is with::
diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst
index 0b8c1e16dce0e..e174eea00ca60 100644
--- a/doc/source/development/contributing_docstring.rst
+++ b/doc/source/development/contributing_docstring.rst
@@ -142,7 +142,7 @@ backticks. The following are considered inline code:
With several mistakes in the docstring.
- It has a blank like after the signature ``def func():``.
+ It has a blank line after the signature ``def func():``.
The text 'Some function' should go in the line after the
opening quotes of the docstring, not in the same line.
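For reference, the two mistakes called out above look like this in a minimal sketch (the ``func`` here is hypothetical, not the guide's own example)::

    def func():

        """Some function.

        With several mistakes in the docstring.
        """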
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 86ce05fde547b..8e6cb9e9a132d 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -205,7 +205,6 @@ Package Minimum support
================================================================ ==========================
`NumPy <https://numpy.org>`__ 1.23.5
`python-dateutil <https://dateutil.readthedocs.io/en/stable/>`__ 2.8.2
-`pytz <https://pypi.org/project/pytz/>`__ 2020.1
`tzdata <https://pypi.org/project/tzdata/>`__ 2022.7
================================================================ ==========================
@@ -419,3 +418,14 @@ Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
Zstandard 0.19.0 compression Zstandard compression
========================= ================== =============== =============================================================
+
+Timezone
+^^^^^^^^
+
+Installable with ``pip install "pandas[timezone]"``
+
+========================= ================== =================== =============================================================
+Dependency Minimum Version pip extra Notes
+========================= ================== =================== =============================================================
+pytz 2023.4 timezone Alternative timezone library to ``zoneinfo``.
+========================= ================== =================== =============================================================
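Because ``pytz`` is now optional, downstream code that specifically wants a ``pytz`` zone should guard the import; a minimal sketch, with ``zoneinfo`` as the fallback since that is the new pandas default::

    import importlib.util

    if importlib.util.find_spec("pytz") is not None:
        import pytz

        tz = pytz.timezone("US/Pacific")
    else:
        from zoneinfo import ZoneInfo

        tz = ZoneInfo("US/Pacific")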
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index 0845417e4910d..4299dca4774b9 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -2569,7 +2569,7 @@ Ambiguous times when localizing
because daylight savings time (DST) in a local time zone causes some times to occur
twice within one day ("clocks fall back"). The following options are available:
-* ``'raise'``: Raises a ``pytz.AmbiguousTimeError`` (the default behavior)
+* ``'raise'``: Raises a ``ValueError`` (the default behavior)
* ``'infer'``: Attempt to determine the correct offset base on the monotonicity of the timestamps
* ``'NaT'``: Replaces ambiguous times with ``NaT``
* ``bool``: ``True`` represents a DST time, ``False`` represents non-DST time. An array-like of ``bool`` values is supported for a sequence of times.
@@ -2604,7 +2604,7 @@ A DST transition may also shift the local time ahead by 1 hour creating nonexist
local times ("clocks spring forward"). The behavior of localizing a timeseries with nonexistent times
can be controlled by the ``nonexistent`` argument. The following options are available:
-* ``'raise'``: Raises a ``pytz.NonExistentTimeError`` (the default behavior)
+* ``'raise'``: Raises a ``ValueError`` (the default behavior)
* ``'NaT'``: Replaces nonexistent times with ``NaT``
* ``'shift_forward'``: Shifts nonexistent times forward to the closest real time
* ``'shift_backward'``: Shifts nonexistent times backward to the closest real time
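A short illustration of the options listed above, using dates that hit real DST transitions (behavior as described in this document; outputs omitted)::

    import pandas as pd

    # "2015-11-01 01:30" occurs twice in US/Eastern ("clocks fall back")
    ambiguous = pd.Timestamp("2015-11-01 01:30")
    ambiguous.tz_localize("US/Eastern", ambiguous=True)   # treat as the DST reading
    ambiguous.tz_localize("US/Eastern", ambiguous="NaT")  # replace with NaT

    # "2015-03-29 02:30" does not exist in Europe/Warsaw ("clocks spring forward")
    missing = pd.Timestamp("2015-03-29 02:30")
    missing.tz_localize("Europe/Warsaw", nonexistent="shift_forward")
    missing.tz_localize("Europe/Warsaw", nonexistent="NaT")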
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 713eadacbeb2a..1e3b22212ac29 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -54,6 +54,7 @@ Other enhancements
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
+- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
@@ -222,6 +223,8 @@ Optional libraries below the lowest tested version may still work, but are not c
+------------------------+---------------------+
| Package | New Minimum Version |
+========================+=====================+
+| pytz | 2023.4 |
++------------------------+---------------------+
| fastparquet | 2023.10.0 |
+------------------------+---------------------+
| adbc-driver-postgresql | 0.10.0 |
@@ -231,6 +234,37 @@ Optional libraries below the lowest tested version may still work, but are not c
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
+.. _whatsnew_300.api_breaking.pytz:
+
+``pytz`` now an optional dependency
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+pandas now uses :py:mod:`zoneinfo` from the standard library as the default timezone implementation when passing a timezone
+string to various methods. (:issue:`34916`)
+
+*Old behavior:*
+
+.. code-block:: ipython
+
+    In [1]: ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific")
+    In [2]: ts.tz
+    Out[2]: <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>
+
+*New behavior:*
+
+.. ipython:: python
+
+    ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific")
+    ts.tz
+
+``pytz`` timezone objects are still supported when passed directly, but they will no longer be returned by default
+from string inputs. Moreover, ``pytz`` is no longer a required dependency of pandas, but can be installed
+with the pip extra ``pip install pandas[timezone]``.
+
+
+Additionally, pandas no longer throws ``pytz`` exceptions for timezone operations leading to ambiguous or nonexistent
+times. These cases will now raise a ``ValueError``.
+
.. _whatsnew_300.api_breaking.other:
Other API changes
@@ -510,7 +544,7 @@ Datetimelike
- Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`)
- Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`)
- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
-- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
+- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
- Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
@@ -622,6 +656,7 @@ Reshaping
^^^^^^^^^
- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
+- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
@@ -634,6 +669,7 @@ ExtensionArray
^^^^^^^^^^^^^^
- Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
+- Bug in comparison between an object with :class:`ArrowDtype` and an incompatibly dtyped object (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`)
Styler
@@ -650,6 +686,7 @@ Other
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`)
+- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
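To make the ``pytz`` change above concrete, a small sketch of both paths (assumes ``pytz`` is installed via the new ``timezone`` extra)::

    import pytz
    import pandas as pd

    # A timezone string now resolves through the standard-library zoneinfo module
    ts = pd.Timestamp(2024, 1, 1).tz_localize("US/Pacific")
    print(type(ts.tz))       # <class 'zoneinfo.ZoneInfo'>

    # A pytz object passed explicitly is still honored
    ts_pytz = pd.Timestamp(2024, 1, 1).tz_localize(pytz.timezone("US/Pacific"))
    print(type(ts_pytz.tz))  # <class 'pytz.tzinfo.DstTzInfo'>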
diff --git a/environment.yml b/environment.yml
index e5646af07c45c..34bc0591ca8df 100644
--- a/environment.yml
+++ b/environment.yml
@@ -24,7 +24,6 @@ dependencies:
# required dependencies
- python-dateutil
- numpy<2
- - pytz
# optional dependencies
- beautifulsoup4>=4.11.2
@@ -50,6 +49,7 @@ dependencies:
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.7
+ - pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 3ee6f6abf97bf..05547e50bbb37 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -3,7 +3,7 @@
__docformat__ = "restructuredtext"
# Let users know if they're missing any of our hard dependencies
-_hard_dependencies = ("numpy", "pytz", "dateutil")
+_hard_dependencies = ("numpy", "dateutil")
_missing_dependencies = []
for _dependency in _hard_dependencies:
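For context, the loop above continues with the usual probe-and-collect pattern; a sketch of that pattern (the raising tail is an assumption based on pandas' long-standing import error message)::

    _hard_dependencies = ("numpy", "dateutil")
    _missing_dependencies = []

    for _dependency in _hard_dependencies:
        try:
            __import__(_dependency)
        except ImportError as _e:
            _missing_dependencies.append(f"{_dependency}: {_e}")

    if _missing_dependencies:
        raise ImportError(
            "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies)
        )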
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 51794ec04b29e..4ed2d4c3be692 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -426,6 +426,11 @@ def option_context(*args) -> Generator[None, None, None]:
None
No return value.
+ Yields
+ ------
+ None
+ No yield value.
+
See Also
--------
get_option : Retrieve the value of the specified option.
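The new ``Yields`` section reflects that ``option_context`` is a generator-backed context manager: it yields ``None`` and restores the previous options on exit. Typical usage, with standard pandas display options::

    import pandas as pd

    with pd.option_context("display.max_rows", 10, "display.max_columns", 5):
        print(pd.get_option("display.max_rows"))  # 10 inside the block

    print(pd.get_option("display.max_rows"))      # original value restored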
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
index f854f7b9210d8..cc65f34d6b6fe 100644
--- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
+++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -20,14 +20,12 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#endif // NPY_NO_DEPRECATED_API
-#include <Python.h>
-
#include "pandas/vendored/numpy/datetime/np_datetime.h"
-
#define NO_IMPORT_ARRAY
#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY
#include <numpy/ndarraytypes.h>
#include <numpy/npy_common.h>
+#include <Python.h>
#if defined(_WIN32)
#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
@@ -58,12 +56,15 @@ _Static_assert(0, "__has_builtin not detected; please try a newer compiler");
#endif
#endif
+#define XSTR(a) STR(a)
+#define STR(a) #a
+
#define PD_CHECK_OVERFLOW(FUNC) \
do { \
if ((FUNC) != 0) { \
PyGILState_STATE gstate = PyGILState_Ensure(); \
PyErr_SetString(PyExc_OverflowError, \
- "Overflow occurred in npy_datetimestruct_to_datetime"); \
+ "Overflow occurred at " __FILE__ ":" XSTR(__LINE__)); \
PyGILState_Release(gstate); \
return -1; \
} \
@@ -139,8 +140,8 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
npy_int64 year, days = 0;
const int *month_lengths;
- year = dts->year - 1970;
- days = year * 365;
+ PD_CHECK_OVERFLOW(checked_int64_sub(dts->year, 1970, &year));
+ PD_CHECK_OVERFLOW(checked_int64_mul(year, 365, &days));
/* Adjust for leap years */
if (days >= 0) {
@@ -148,32 +149,32 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
* 1968 is the closest leap year before 1970.
* Exclude the current year, so add 1.
*/
- year += 1;
+ PD_CHECK_OVERFLOW(checked_int64_add(year, 1, &year));
/* Add one day for each 4 years */
- days += year / 4;
+ PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days));
/* 1900 is the closest previous year divisible by 100 */
- year += 68;
+ PD_CHECK_OVERFLOW(checked_int64_add(year, 68, &year));
/* Subtract one day for each 100 years */
- days -= year / 100;
+ PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days));
/* 1600 is the closest previous year divisible by 400 */
- year += 300;
+ PD_CHECK_OVERFLOW(checked_int64_add(year, 300, &year));
/* Add one day for each 400 years */
- days += year / 400;
+ PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days));
} else {
/*
* 1972 is the closest later year after 1970.
* Include the current year, so subtract 2.
*/
- year -= 2;
+ PD_CHECK_OVERFLOW(checked_int64_sub(year, 2, &year));
/* Subtract one day for each 4 years */
- days += year / 4;
+ PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days));
/* 2000 is the closest later year divisible by 100 */
- year -= 28;
+ PD_CHECK_OVERFLOW(checked_int64_sub(year, 28, &year));
/* Add one day for each 100 years */
- days -= year / 100;
+ PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days));
/* 2000 is also the closest later year divisible by 400 */
/* Subtract one day for each 400 years */
- days += year / 400;
+ PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days));
}
month_lengths = days_per_month_table[is_leapyear(dts->year)];
@@ -181,11 +182,11 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
/* Add the months */
for (i = 0; i < month; ++i) {
- days += month_lengths[i];
+ PD_CHECK_OVERFLOW(checked_int64_add(days, month_lengths[i], &days));
}
/* Add the days */
- days += dts->day - 1;
+ PD_CHECK_OVERFLOW(checked_int64_add(days, dts->day - 1, &days));
return days;
}
@@ -430,6 +431,15 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
}
const int64_t days = get_datetimestruct_days(dts);
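+ // -1 can also be a legitimate day count, so only treat it as an error
+ // if get_datetimestruct_days actually set a Python exception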
+ if (days == -1) {
+ PyGILState_STATE gstate = PyGILState_Ensure();
+ bool did_error = PyErr_Occurred() == NULL ? false : true;
+ PyGILState_Release(gstate);
+ if (did_error) {
+ return -1;
+ }
+ }
+
if (base == NPY_FR_D) {
return days;
}
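The ``checked_int64_*`` helpers used above make every intermediate step fail loudly instead of silently wrapping. A Python mirror of the idea, as a sketch (Python integers never overflow, so the 64-bit bounds are checked explicitly; the names simply follow the C calls)::

    INT64_MIN, INT64_MAX = -(2**63), 2**63 - 1

    def checked_int64_add(a: int, b: int) -> int:
        """Add, raising OverflowError where int64 arithmetic would wrap."""
        result = a + b
        if not (INT64_MIN <= result <= INT64_MAX):
            raise OverflowError(f"Overflow occurred in {a} + {b}")
        return result

    def checked_int64_mul(a: int, b: int) -> int:
        """Multiply, raising OverflowError where int64 arithmetic would wrap."""
        result = a * b
        if not (INT64_MIN <= result <= INT64_MAX):
            raise OverflowError(f"Overflow occurred in {a} * {b}")
        return result

    try:
        checked_int64_mul(2**60, 365)  # a year delta far beyond any calendar
    except OverflowError as err:
        print(err)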
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index 0fadbbbed2c72..a635dd33f8420 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -69,6 +69,7 @@ from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs.tslibs.timezones cimport (
get_utcoffset,
is_utc,
+ treat_tz_as_pytz,
)
from pandas._libs.tslibs.tzconversion cimport (
Localizer,
@@ -747,11 +748,17 @@ cdef datetime _localize_pydatetime(datetime dt, tzinfo tz):
identically, i.e. discards nanos from Timestamps.
It also assumes that the `tz` input is not None.
"""
-    try:
+    if treat_tz_as_pytz(tz):
+        import pytz
+
        # datetime.replace with pytz may be incorrect result
        # TODO: try to respect `fold` attribute
-        return tz.localize(dt, is_dst=None)
-    except AttributeError:
+        try:
+            return tz.localize(dt, is_dst=None)
+        except (pytz.AmbiguousTimeError, pytz.NonExistentTimeError) as err:
+            # As of pandas 3.0, we raise ValueErrors instead of pytz exceptions
+            raise ValueError(str(err)) from err
+    else:
        return dt.replace(tzinfo=tz)
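From user code, the visible effect is that a single ``except ValueError`` now covers ambiguous and nonexistent local times regardless of the timezone backend; for example::

    import pandas as pd

    ts = pd.Timestamp("2015-03-29 02:30")  # does not exist in Europe/Warsaw
    try:
        ts.tz_localize("Europe/Warsaw")
    except ValueError as err:
        # pandas < 3.0 surfaced this as pytz.NonExistentTimeError
        print(err)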
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index 130e41e5104a2..41011ff13737a 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -229,7 +229,17 @@ cdef class _NaT(datetime):
def to_datetime64(self) -> np.datetime64:
"""
- Return a numpy.datetime64 object with same precision.
+ Return a NumPy datetime64 object with the same precision.
+
+ This method returns a numpy.datetime64 object with the same
+ date and time information and precision as the pd.Timestamp object.
+
+ See Also
+ --------
+ numpy.datetime64 : Class to represent dates and times with high precision.
+ Timestamp.to_numpy : Alias for this method.
+ Timestamp.asm8 : Alias for this method.
+ pd.to_datetime : Convert argument to datetime.
Examples
--------
@@ -244,16 +254,24 @@ cdef class _NaT(datetime):
def to_numpy(self, dtype=None, copy=False) -> np.datetime64 | np.timedelta64:
"""
- Convert the Timestamp to a NumPy datetime64 or timedelta64.
+ Convert the Timestamp to a NumPy datetime64.
- With the default 'dtype', this is an alias method for `NaT.to_datetime64()`.
-
- The copy parameter is available here only for compatibility. Its value
+ This is an alias method for `Timestamp.to_datetime64()`. The dtype and
+ copy parameters are available here only for compatibility. Their values
will not affect the return value.
+ Parameters
+ ----------
+ dtype : dtype, optional
+ Data type of the output, ignored in this method as the return type
+ is always `numpy.datetime64`.
+ copy : bool, default False
+ Whether to ensure that the returned value is a new object. This
+ parameter is also ignored as the method does not support copying.
+
Returns
-------
- numpy.datetime64 or numpy.timedelta64
+ numpy.datetime64
See Also
--------
@@ -269,9 +287,6 @@ cdef class _NaT(datetime):
>>> pd.NaT.to_numpy()
numpy.datetime64('NaT')
-
- >>> pd.NaT.to_numpy("m8[ns]")
- numpy.timedelta64('NaT','ns')
"""
if dtype is not None:
# GH#44460
@@ -476,6 +491,11 @@ class NaTType(_NaT):
"""
Return the month name of the Timestamp with specified locale.
+ This method returns the full name of the month corresponding to the
+ `Timestamp`, such as 'January', 'February', etc. The month name can
+ be returned in a specified locale if provided; otherwise, it defaults
+ to the English locale.
+
Parameters
----------
locale : str, default None (English locale)
@@ -484,9 +504,18 @@ class NaTType(_NaT):
Returns
-------
str
+ The full month name as a string.
+
+ See Also
+ --------
+ Timestamp.day_name : Returns the name of the day of the week.
+ Timestamp.strftime : Returns a formatted string of the Timestamp.
+ datetime.datetime.strftime : Returns a string representing the date and time.
Examples
--------
+ Get the month name in English (default):
+
>>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
>>> ts.month_name()
'March'
@@ -581,10 +610,25 @@ class NaTType(_NaT):
date = _make_nat_func(
"date",
"""
- Return date object with same year, month and day.
+ Return `datetime.date` with the same year, month, and day.
+
+ This method extracts the date component from the `Timestamp` and returns
+ it as a `datetime.date` object, discarding the time information.
+
+ Returns
+ -------
+ datetime.date
+ The date part of the `Timestamp`.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ datetime.datetime.date : Extract the date component from a `datetime` object.
Examples
--------
+ Extract the date from a Timestamp:
+
>>> ts = pd.Timestamp('2023-01-01 10:00:00.00')
>>> ts
Timestamp('2023-01-01 10:00:00')
@@ -595,7 +639,24 @@ class NaTType(_NaT):
utctimetuple = _make_error_func(
"utctimetuple",
"""
- Return UTC time tuple, compatible with time.localtime().
+ Return UTC time tuple, compatible with `time.localtime()`.
+
+ This method converts the Timestamp to UTC and returns a time tuple
+ containing 9 components: year, month, day, hour, minute, second,
+ weekday, day of year, and DST flag. This is particularly useful for
+ converting a Timestamp to a format compatible with time module functions.
+
+ Returns
+ -------
+ time.struct_time
+ A time.struct_time object representing the UTC time.
+
+ See Also
+ --------
+ datetime.datetime.utctimetuple :
+ Return UTC time tuple, compatible with time.localtime().
+ Timestamp.timetuple : Return time tuple of local time.
+ time.struct_time : Time tuple structure used by time functions.
Examples
--------
@@ -612,6 +673,22 @@ class NaTType(_NaT):
"""
Return utc offset.
+ This method returns the difference between UTC and the local time
+ as a `timedelta` object. It is useful for understanding the time
+ difference between the current timezone and UTC.
+
+ Returns
+ -------
+ timedelta
+ The difference between UTC and the local time as a `timedelta` object.
+
+ See Also
+ --------
+ datetime.datetime.utcoffset :
+ Standard library method to get the UTC offset of a datetime object.
+ Timestamp.tzname : Return the name of the timezone.
+ Timestamp.dst : Return the daylight saving time (DST) adjustment.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -626,6 +703,13 @@ class NaTType(_NaT):
"""
Return time zone name.
+ This method returns the name of the Timestamp's time zone as a string.
+
+ See Also
+ --------
+ Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+ Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -664,6 +748,17 @@ class NaTType(_NaT):
"""
Return time tuple, compatible with time.localtime().
+ This method converts the `Timestamp` into a time tuple, which is compatible
+ with functions like `time.localtime()`. The time tuple is a named tuple with
+ attributes such as year, month, day, hour, minute, second, weekday,
+ day of the year, and daylight savings indicator.
+
+ See Also
+ --------
+ time.localtime : Converts a POSIX timestamp into a time tuple.
+ Timestamp : The `Timestamp` that represents a specific point in time.
+ datetime.datetime.timetuple : Equivalent method in the `datetime` module.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00')
@@ -679,6 +774,19 @@ class NaTType(_NaT):
"""
Return time object with same time and tzinfo.
+ This method returns a datetime.time object with
+ the time and tzinfo corresponding to the pd.Timestamp
+ object, ignoring any information about the day/date.
+
+ See Also
+ --------
+ datetime.datetime.timetz : Return datetime.time object with the
+ same time attributes as the datetime object.
+ datetime.time : Class to represent the time of day, independent
+ of any particular day.
+ datetime.datetime.tzinfo : Attribute of datetime.datetime objects
+ representing the timezone of the datetime object.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -693,6 +801,17 @@ class NaTType(_NaT):
"""
Return proleptic Gregorian ordinal. January 1 of year 1 is day 1.
+ The proleptic Gregorian ordinal is a continuous count of days since
+ January 1 of year 1, which is considered day 1. This method converts
+ the `Timestamp` to its equivalent ordinal number, useful for date arithmetic
+ and comparison operations.
+
+ See Also
+ --------
+ datetime.datetime.toordinal : Equivalent method in the `datetime` module.
+ Timestamp : The `Timestamp` that represents a specific point in time.
+ Timestamp.fromordinal : Create a `Timestamp` from an ordinal.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:50')
@@ -705,7 +824,25 @@ class NaTType(_NaT):
ctime = _make_error_func(
"ctime",
"""
- Return ctime() style string.
+ Return a ctime() style string representing the Timestamp.
+
+ This method returns a string representing the date and time
+ in the format returned by the standard library's `time.ctime()`
+ function, which is typically in the form 'Day Mon DD HH:MM:SS YYYY'.
+
+ If the `Timestamp` is outside the range supported by Python's
+ standard library, a `NotImplementedError` is raised.
+
+ Returns
+ -------
+ str
+ A string representing the Timestamp in ctime format.
+
+ See Also
+ --------
+ time.ctime : Return a string representing time in ctime format.
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ datetime.datetime.ctime : Return a ctime style string from a datetime object.
Examples
--------
@@ -746,9 +883,27 @@ class NaTType(_NaT):
strptime = _make_error_func(
"strptime",
"""
- Timestamp.strptime(string, format)
+ Convert string argument to datetime.
+
+ This method is not implemented; calling it will raise NotImplementedError.
+ Use pd.to_datetime() instead.
+
+ Parameters
+ ----------
+ date_string : str
+ String to convert to a datetime.
+ format : str, default None
+ The format string to parse time, e.g. "%d/%m/%Y".

- Function is not implemented. Use pd.to_datetime().
+ See Also
+ --------
+ pd.to_datetime : Convert argument to datetime.
+ datetime.datetime.strptime : Return a datetime corresponding to a string
+ representing a date and time, parsed according to a separate
+ format string.
+ datetime.datetime.strftime : Return a string representing the date and
+ time, controlled by an explicit format string.
+ Timestamp.isoformat : Return the time formatted according to ISO 8601.
Examples
--------
@@ -765,6 +920,21 @@ class NaTType(_NaT):
Construct a timezone-aware UTC datetime from a POSIX timestamp.
+ This method creates a timezone-aware UTC `Timestamp` from a POSIX
+ timestamp, so the result always carries the UTC timezone.
+
+ Parameters
+ ----------
+ ts : float
+ POSIX timestamp.
+
+ See Also
+ --------
+ Timestamp.tzname : Return time zone name.
+ Timestamp.utcnow : Return a new Timestamp representing UTC day and time.
+ Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local
+ time from POSIX timestamp.
+
Notes
-----
Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp
@@ -779,16 +949,43 @@ class NaTType(_NaT):
fromtimestamp = _make_error_func(
"fromtimestamp",
"""
- Timestamp.fromtimestamp(ts)
+ Create a `Timestamp` object from a POSIX timestamp.
- Transform timestamp[, tz] to tz's local time from POSIX timestamp.
+ This method converts a POSIX timestamp (the number of seconds since
+ January 1, 1970, 00:00:00 UTC) into a `Timestamp` object. The resulting
+ `Timestamp` can be localized to a specific time zone if provided.
+
+ Parameters
+ ----------
+ ts : float
+ The POSIX timestamp to convert, representing seconds since
+ the epoch (1970-01-01 00:00:00 UTC).
+ tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, optional
+ Time zone for the `Timestamp`. If not provided, the `Timestamp` will
+ be timezone-naive (i.e., without time zone information).
+
+ Returns
+ -------
+ Timestamp
+ A `Timestamp` object representing the given POSIX timestamp.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ to_datetime : Converts various types of data to datetime.
+ datetime.datetime.fromtimestamp : Returns a datetime from a POSIX timestamp.
Examples
--------
+ Convert a POSIX timestamp to a `Timestamp`:
+
>>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP
Timestamp('2020-03-14 15:32:52')
- Note that the output may change depending on your local time.
+ Note that the output may change depending on your local time and time zone:
+
+ >>> pd.Timestamp.fromtimestamp(1584199972, tz='UTC') # doctest: +SKIP
+ Timestamp('2020-03-14 15:32:52+0000', tz='UTC')
""",
)
combine = _make_error_func(
@@ -796,7 +993,28 @@ class NaTType(_NaT):
"""
Timestamp.combine(date, time)
- Combine date, time into datetime with same date and time fields.
+ Combine a date and time into a single Timestamp object.
+
+ This method takes a `date` object and a `time` object
+ and combines them into a single `Timestamp`
+ that has the same date and time fields.
+
+ Parameters
+ ----------
+ date : datetime.date
+ The date part of the Timestamp.
+ time : datetime.time
+ The time part of the Timestamp.
+
+ Returns
+ -------
+ Timestamp
+ A new `Timestamp` object representing the combined date and time.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ to_datetime : Converts various types of data to datetime.
Examples
--------
@@ -836,6 +1054,23 @@ class NaTType(_NaT):
"""
Return POSIX timestamp as float.
+ This method converts the `Timestamp` object to a POSIX timestamp, which is
+ the number of seconds since the Unix epoch (January 1, 1970). The returned
+ value is a floating-point number, where the integer part represents the
+ seconds, and the fractional part represents the microseconds.
+
+ Returns
+ -------
+ float
+ The POSIX timestamp representation of the `Timestamp` object.
+
+ See Also
+ --------
+ Timestamp.fromtimestamp : Construct a `Timestamp` from a POSIX timestamp.
+ datetime.datetime.timestamp : Equivalent method from the `datetime` module.
+ Timestamp.to_pydatetime : Convert the `Timestamp` to a `datetime` object.
+ Timestamp.to_datetime64 : Converts `Timestamp` to `numpy.datetime64`.
+
Examples
--------
>>> ts = pd.Timestamp('2020-03-14T15:32:52.192548')
@@ -907,6 +1142,11 @@ class NaTType(_NaT):
"""
Construct a timestamp from a proleptic Gregorian ordinal.
+ This method creates a `Timestamp` object corresponding to the given
+ proleptic Gregorian ordinal, which is a count of days from January 1,
+ 0001 (using the proleptic Gregorian calendar). The time part of the
+ `Timestamp` is set to midnight (00:00:00) by default.
+
Parameters
----------
ordinal : int
@@ -914,14 +1154,31 @@ class NaTType(_NaT):
tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None
Time zone for the Timestamp.
+ Returns
+ -------
+ Timestamp
+ A `Timestamp` object representing the specified ordinal date.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ to_datetime : Converts various types of data to datetime.
+
Notes
-----
By definition there cannot be any tz info on the ordinal itself.
Examples
--------
+ Convert an ordinal to a `Timestamp`:
+
>>> pd.Timestamp.fromordinal(737425)
Timestamp('2020-01-01 00:00:00')
+
+ Create a `Timestamp` from an ordinal with timezone information:
+
+ >>> pd.Timestamp.fromordinal(737425, tz='UTC')
+ Timestamp('2020-01-01 00:00:00+0000', tz='UTC')
""",
)
@@ -1013,6 +1270,12 @@ class NaTType(_NaT):
tz : str or timezone object, default None
Timezone to localize to.
+ See Also
+ --------
+ datetime.datetime.today : Returns the current local datetime.
+ Timestamp.now : Returns current time with optional timezone.
+ Timestamp : A class representing a specific timestamp.
+
Examples
--------
>>> pd.Timestamp.today() # doctest: +SKIP
@@ -1045,9 +1308,9 @@ class NaTType(_NaT):
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -1058,7 +1321,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -1146,9 +1409,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -1159,7 +1422,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Raises
@@ -1241,9 +1504,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -1254,7 +1517,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Raises
@@ -1405,9 +1668,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \
default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -1420,7 +1683,7 @@ default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -1432,6 +1695,13 @@ default 'raise'
TypeError
If the Timestamp is tz-aware and tz is not None.
+ See Also
+ --------
+ Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+ Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+ DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone.
+ datetime.datetime.astimezone : Convert a datetime object to another time zone.
+
Examples
--------
Create a naive timestamp object:
@@ -1456,22 +1726,48 @@ default 'raise'
"""
Implements datetime.replace, handles nanoseconds.
+ This method creates a new `Timestamp` object by replacing the specified
+ fields with new values. The new `Timestamp` retains the original fields
+ that are not explicitly replaced. This method handles nanoseconds, and
+ the `tzinfo` parameter allows for timezone replacement without conversion.
+
Parameters
----------
year : int, optional
+ The year to replace. If `None`, the year is not changed.
month : int, optional
+ The month to replace. If `None`, the month is not changed.
day : int, optional
+ The day to replace. If `None`, the day is not changed.
hour : int, optional
+ The hour to replace. If `None`, the hour is not changed.
minute : int, optional
+ The minute to replace. If `None`, the minute is not changed.
second : int, optional
+ The second to replace. If `None`, the second is not changed.
microsecond : int, optional
+ The microsecond to replace. If `None`, the microsecond is not changed.
nanosecond : int, optional
+ The nanosecond to replace. If `None`, the nanosecond is not changed.
tzinfo : tz-convertible, optional
+ The timezone information to replace. If `None`, the timezone is not changed.
fold : int, optional
+ The fold information to replace. If `None`, the fold is not changed.
Returns
-------
- Timestamp with fields replaced
+ Timestamp
+ A new `Timestamp` object with the specified fields replaced.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ to_datetime : Converts various types of data to datetime.
+
+ Notes
+ -----
+ The `replace` method does not perform timezone conversions. If you need
+ to convert the timezone, use the `tz_convert` method instead.
Examples
--------
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index 554c4f109f1c5..c48acc07b34db 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -595,6 +595,24 @@ cdef class BaseOffset:
@property
def rule_code(self) -> str:
+ """
+ Return a string representing the base frequency.
+
+ See Also
+ --------
+ tseries.offsets.Hour.rule_code :
+ Returns a string representing the base frequency of 'h'.
+ tseries.offsets.Day.rule_code :
+ Returns a string representing the base frequency of 'D'.
+
+ Examples
+ --------
+ >>> pd.offsets.Hour().rule_code
+ 'h'
+
+ >>> pd.offsets.Week(5).rule_code
+ 'W'
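+
+ Other offsets report their own prefix, for example ``Day``:
+
+ >>> pd.offsets.Day().rule_code
+ 'D'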
+ """
return self._prefix
@cache_readonly
@@ -602,6 +620,17 @@ cdef class BaseOffset:
"""
Return a string representing the frequency.
+ See Also
+ --------
+ tseries.offsets.BusinessDay.freqstr :
+ Return a string representing an offset frequency in Business Days.
+ tseries.offsets.BusinessHour.freqstr :
+ Return a string representing an offset frequency in Business Hours.
+ tseries.offsets.Week.freqstr :
+ Return a string representing an offset frequency in Weeks.
+ tseries.offsets.Hour.freqstr :
+ Return a string representing an offset frequency in Hours.
+
Examples
--------
>>> pd.DateOffset(5).freqstr
@@ -779,6 +808,26 @@ cdef class BaseOffset:
@property
def nanos(self):
+ """
+ Returns an integer of the total number of nanoseconds for fixed frequencies.
+
+ Raises
+ ------
+ ValueError
+ If the frequency is non-fixed.
+
+ See Also
+ --------
+ tseries.offsets.Hour.nanos :
+ Returns an integer of the total number of nanoseconds.
+ tseries.offsets.Day.nanos :
+ Returns an integer of the total number of nanoseconds.
+
+ Examples
+ --------
+ >>> pd.offsets.Week(n=1).nanos
+ Traceback (most recent call last):
+ ValueError: Week: weekday=None is a non-fixed frequency
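+
+ Fixed frequencies such as ``Hour`` return their total span in nanoseconds:
+
+ >>> pd.offsets.Hour(5).nanos
+ 18000000000000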
+ """
raise ValueError(f"{self} is a non-fixed frequency")
# ------------------------------------------------------------------
@@ -986,12 +1035,14 @@ cdef class Tick(SingleConstructorOffset):
@property
def nanos(self) -> int64_t:
"""
- Return an integer of the total number of nanoseconds.
+ Returns an integer of the total number of nanoseconds.
- Raises
- ------
- ValueError
- If the frequency is non-fixed.
+ See Also
+ --------
+ tseries.offsets.Hour.nanos :
+ Returns an integer of the total number of nanoseconds.
+ tseries.offsets.Day.nanos :
+ Returns an integer of the total number of nanoseconds.
Examples
--------
@@ -1147,7 +1198,7 @@ cdef class Hour(Tick):
"""
Offset ``n`` hours.
- Parameters
+ Attributes
----------
n : int, default 1
The number of hours represented.
@@ -1183,7 +1234,7 @@ cdef class Minute(Tick):
"""
Offset ``n`` minutes.
- Parameters
+ Attributes
----------
n : int, default 1
The number of minutes represented.
@@ -1219,7 +1270,7 @@ cdef class Second(Tick):
"""
Offset ``n`` seconds.
- Parameters
+ Attributes
----------
n : int, default 1
The number of seconds represented.
@@ -1255,7 +1306,7 @@ cdef class Milli(Tick):
"""
Offset ``n`` milliseconds.
- Parameters
+ Attributes
----------
n : int, default 1
The number of milliseconds represented.
@@ -1292,7 +1343,7 @@ cdef class Micro(Tick):
"""
Offset ``n`` microseconds.
- Parameters
+ Attributes
----------
n : int, default 1
The number of microseconds represented.
@@ -1329,7 +1380,7 @@ cdef class Nano(Tick):
"""
Offset ``n`` nanoseconds.
- Parameters
+ Attributes
----------
n : int, default 1
The number of nanoseconds represented.
@@ -1616,7 +1667,7 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta):
Besides, adding a DateOffsets specified by the singular form of the date
component can be used to replace certain component of the timestamp.
- Parameters
+ Attributes
----------
n : int, default 1
The number of time periods the offset represents.
@@ -2426,6 +2477,24 @@ cdef class WeekOfMonthMixin(SingleConstructorOffset):
@property
def rule_code(self) -> str:
+ """
+ Return a string representing the base frequency.
+
+ See Also
+ --------
+ tseries.offsets.Hour.rule_code :
+ Returns a string representing the base frequency of 'h'.
+ tseries.offsets.Day.rule_code :
+ Returns a string representing the base frequency of 'D'.
+
+ Examples
+ --------
+ >>> pd.offsets.Week(5).rule_code
+ 'W'
+
+ >>> pd.offsets.WeekOfMonth(n=1, week=0, weekday=0).rule_code
+ 'WOM-1MON'
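+
+ ``LastWeekOfMonth`` uses ``week=-1`` and omits the week number from the code:
+
+ >>> pd.offsets.LastWeekOfMonth(n=1, weekday=0).rule_code
+ 'LWOM-MON'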
+ """
weekday = int_to_weekday.get(self.weekday, "")
if self.week == -1:
# LastWeekOfMonth
@@ -2472,6 +2541,24 @@ cdef class YearOffset(SingleConstructorOffset):
@property
def rule_code(self) -> str:
+ """
+ Return a string representing the base frequency.
+
+ See Also
+ --------
+ tseries.offsets.Hour.rule_code :
+ Returns a string representing the base frequency of 'h'.
+ tseries.offsets.Day.rule_code :
+ Returns a string representing the base frequency of 'D'.
+
+ Examples
+ --------
+ >>> pd.tseries.offsets.YearBegin(n=1, month=2).rule_code
+ 'YS-FEB'
+
+ >>> pd.tseries.offsets.YearEnd(n=1, month=6).rule_code
+ 'YE-JUN'
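+
+ Business-year offsets prepend ``B`` to the prefix:
+
+ >>> pd.tseries.offsets.BYearEnd(n=1, month=12).rule_code
+ 'BYE-DEC'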
+ """
month = MONTH_ALIASES[self.month]
return f"{self._prefix}-{month}"
@@ -2506,7 +2593,7 @@ cdef class BYearEnd(YearOffset):
"""
DateOffset increments between the last business day of the year.
- Parameters
+ Attributes
----------
n : int, default 1
The number of years represented.
@@ -2804,7 +2891,7 @@ cdef class BQuarterBegin(QuarterOffset):
startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ...
startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ...
- Parameters
+ Attributes
----------
n : int, default 1
The number of quarters represented.
@@ -2886,7 +2973,7 @@ cdef class QuarterBegin(QuarterOffset):
startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ...
startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ...
- Parameters
+ Attributes
----------
n : int, default 1
The number of quarters represented.
@@ -2984,7 +3071,7 @@ cdef class MonthBegin(MonthOffset):
MonthBegin goes to the next date which is a start of the month.
- Parameters
+ Attributes
----------
n : int, default 1
The number of months represented.
@@ -3272,7 +3359,7 @@ cdef class SemiMonthBegin(SemiMonthOffset):
"""
Two DateOffset's per month repeating on the first day of the month & day_of_month.
- Parameters
+ Attributes
----------
n : int, default 1
The number of months represented.
@@ -3304,7 +3391,7 @@ cdef class Week(SingleConstructorOffset):
"""
Weekly offset.
- Parameters
+ Attributes
----------
n : int, default 1
The number of weeks represented.
@@ -3458,6 +3545,24 @@ cdef class Week(SingleConstructorOffset):
@property
def rule_code(self) -> str:
+ """
+ Return a string representing the base frequency.
+
+ See Also
+ --------
+ tseries.offsets.Hour.rule_code :
+ Returns a string representing the base frequency of 'h'.
+ tseries.offsets.Day.rule_code :
+ Returns a string representing the base frequency of 'D'.
+
+ Examples
+ --------
+ >>> pd.offsets.Hour().rule_code
+ 'h'
+
+ >>> pd.offsets.Week(5).rule_code
+ 'W'
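+
+ Anchoring the ``Week`` to a weekday appends the day abbreviation:
+
+ >>> pd.offsets.Week(weekday=0).rule_code
+ 'W-MON'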
+ """
suffix = ""
if self.weekday is not None:
weekday = int_to_weekday[self.weekday]
@@ -3477,7 +3582,7 @@ cdef class WeekOfMonth(WeekOfMonthMixin):
"""
Describes monthly dates like "the Tuesday of the 2nd week of each month".
- Parameters
+ Attributes
----------
n : int, default 1
The number of months represented.
@@ -3554,7 +3659,7 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin):
For example "the last Tuesday of each month".
- Parameters
+ Attributes
----------
n : int, default 1
The number of months represented.
@@ -3694,7 +3799,7 @@ cdef class FY5253(FY5253Mixin):
X is a specific day of the week.
Y is a certain month of the year
- Parameters
+ Attributes
----------
n : int
The number of fiscal years represented.
@@ -3897,7 +4002,7 @@ cdef class FY5253Quarter(FY5253Mixin):
startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ...
- Parameters
+ Attributes
----------
n : int
The number of business quarters represented.
@@ -4132,7 +4237,7 @@ cdef class Easter(SingleConstructorOffset):
Right now uses the revised method which is valid in years 1583-4099.
- Parameters
+ Attributes
----------
n : int, default 1
The number of years represented.
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index c6ba97fe9f1a2..4f5dfc75a20bf 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1913,20 +1913,58 @@ cdef class _Period(PeriodMixin):
Parameters
----------
freq : str, BaseOffset
- The desired frequency. If passing a `str`, it needs to be a
- valid :ref:`period alias <timeseries.period_aliases>`.
+ The target frequency to convert the Period object to.
+ If a string is provided,
+ it must be a valid :ref:`period alias <timeseries.period_aliases>`.
+
how : {'E', 'S', 'end', 'start'}, default 'end'
- Start or end of the timespan.
+ Specifies whether to align the period to the start or end of the interval:
+ - 'E' or 'end': Align to the end of the interval.
+ - 'S' or 'start': Align to the start of the interval.
Returns
-------
- resampled : Period
+ Period : Period object with the specified frequency, aligned as specified by ``how``.
+
+ See Also
+ --------
+ Period.end_time : Return the end Timestamp.
+ Period.start_time : Return the start Timestamp.
+ Period.dayofyear : Return the day of the year.
+ Period.dayofweek : Return the day of the week.
Examples
--------
- >>> period = pd.Period('2023-1-1', freq='D')
+ Convert a daily period to an hourly period, aligning to the end of the day:
+
+ >>> period = pd.Period('2023-01-01', freq='D')
>>> period.asfreq('h')
Period('2023-01-01 23:00', 'h')
+
+ Convert a monthly period to a daily period, aligning to the start of the month:
+
+ >>> period = pd.Period('2023-01', freq='M')
+ >>> period.asfreq('D', how='start')
+ Period('2023-01-01', 'D')
+
+ Convert a yearly period to a monthly period, aligning to the last month:
+
+ >>> period = pd.Period('2023', freq='Y')
+ >>> period.asfreq('M', how='end')
+ Period('2023-12', 'M')
+
+ Convert a monthly period to an hourly period,
+ aligning to the first day of the month:
+
+ >>> period = pd.Period('2023-01', freq='M')
+ >>> period.asfreq('h', how='start')
+ Period('2023-01-01 00:00', 'h')
+
+ Convert a weekly period to a daily period, aligning to the last day of the week:
+
+ >>> period = pd.Period('2023-08-01', freq='W')
+ >>> period.asfreq('D', how='end')
+ Period('2023-08-06', 'D')
"""
freq = self._maybe_convert_freq(freq)
how = validate_end_alias(how)
@@ -2000,11 +2038,44 @@ cdef class _Period(PeriodMixin):
"""
Return the year this Period falls on.
+ Returns
+ -------
+ int
+
+ See Also
+ --------
+ Period.month : Get the month of the year for the given Period.
+ Period.day : Return the day of the month the Period falls on.
+
+ Notes
+ -----
+ The year is based on the `ordinal` and `base` attributes of the Period.
+
Examples
--------
- >>> period = pd.Period('2022-01', 'M')
+ Create a Period object for January 2023 and get the year:
+
+ >>> period = pd.Period('2023-01', 'M')
>>> period.year
- 2022
+ 2023
+
+ Create a Period object for 01 January 2023 and get the year:
+
+ >>> period = pd.Period('2023', 'D')
+ >>> period.year
+ 2023
+
+ Get the year for a period representing a quarter:
+
+ >>> period = pd.Period('2023Q2', 'Q')
+ >>> period.year
+ 2023
+
+ If the Period is missing (`NaT`), the result is `NaN`:
+
+ >>> period = pd.Period('nan', 'M')
+ >>> period.year
+ nan
"""
base = self._dtype._dtype_code
return pyear(self.ordinal, base)
@@ -2014,11 +2085,45 @@ cdef class _Period(PeriodMixin):
"""
Return the month this Period falls on.
+ Returns
+ -------
+ int
+
+ See Also
+ --------
+ Period.week : Get the week of the year for the given Period.
+ Period.year : Return the year this Period falls on.
+ Period.day : Return the day of the month this Period falls on.
+
+ Notes
+ -----
+ The month is based on the `ordinal` and `base` attributes of the Period.
+
Examples
--------
+ Create a Period object for January 2022 and get the month:
+
>>> period = pd.Period('2022-01', 'M')
>>> period.month
1
+
+ For a yearly period, the month is evaluated at the end of the period:
+
+ >>> period = pd.Period('2022', 'Y')
+ >>> period.month
+ 12
+
+ Create a Period object with a specified frequency but an incomplete date string:
+
+ >>> period = pd.Period('2022', 'M')
+ >>> period.month
+ 1
+
+ If the Period is missing (`NaT`), the result is `NaN`:
+
+ >>> period = pd.Period('nan', 'M')
+ >>> period.month
+ nan
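+
+ Get the month for a period representing a quarter, evaluated at the
+ end of the period:
+
+ >>> period = pd.Period('2023Q2', 'Q')
+ >>> period.month
+ 6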
"""
base = self._dtype._dtype_code
return pmonth(self.ordinal, base)
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
index 43279051e2a30..ccb1a1d6870f7 100644
--- a/pandas/_libs/tslibs/strptime.pyx
+++ b/pandas/_libs/tslibs/strptime.pyx
@@ -16,6 +16,7 @@ FUNCTIONS:
strptime -- Calculates the time struct represented by the passed-in string
"""
from datetime import timezone
+import zoneinfo
from cpython.datetime cimport (
PyDate_Check,
@@ -38,7 +39,6 @@ from _thread import allocate_lock as _thread_allocate_lock
import re
import numpy as np
-import pytz
cimport numpy as cnp
from numpy cimport (
@@ -747,7 +747,7 @@ cdef tzinfo _parse_with_format(
week_of_year_start = 0
elif parse_code == 17:
# e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z'
- tz = pytz.timezone(found_dict["Z"])
+ tz = zoneinfo.ZoneInfo(found_dict["Z"])
elif parse_code == 19:
# e.g. val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z'
tz = parse_timezone_directive(found_dict["z"])
@@ -837,7 +837,7 @@ class TimeRE(_TimeRE):
if key == "Z":
# lazy computation
if self._Z is None:
- self._Z = self.__seqToRE(pytz.all_timezones, "Z")
+ self._Z = self.__seqToRE(zoneinfo.available_timezones(), "Z")
# Note: handling Z is the key difference vs using the stdlib
# _strptime.TimeRE. test_to_datetime_parse_tzname_or_tzoffset with
# fmt='%Y-%m-%d %H:%M:%S %Z' fails with the stdlib version.
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 369184d9df40c..3268207b667f2 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -254,6 +254,28 @@ cdef class _Timestamp(ABCTimestamp):
"""
The abbreviation associated with self._creso.
+ This property returns a string representing the time unit of the Timestamp's
+ resolution. It corresponds to the smallest time unit that can be represented
+ by this Timestamp object. The possible values are:
+ - 's' (second)
+ - 'ms' (millisecond)
+ - 'us' (microsecond)
+ - 'ns' (nanosecond)
+
+ Returns
+ -------
+ str
+ A string abbreviation of the Timestamp's resolution unit:
+ - 's' for second
+ - 'ms' for millisecond
+ - 'us' for microsecond
+ - 'ns' for nanosecond
+
+ See Also
+ --------
+ Timestamp.resolution : Return resolution of the Timestamp.
+ Timedelta : A duration expressing the difference between two dates or times.
+
Examples
--------
>>> pd.Timestamp("2020-01-01 12:34:56").unit
@@ -793,6 +815,11 @@ cdef class _Timestamp(ABCTimestamp):
"""
Return the month name of the Timestamp with specified locale.
+ This method returns the full name of the month corresponding to the
+ `Timestamp`, such as 'January', 'February', etc. The month name can
+ be returned in a specified locale if provided; otherwise, it defaults
+ to the English locale.
+
Parameters
----------
locale : str, default None (English locale)
@@ -801,9 +828,18 @@ cdef class _Timestamp(ABCTimestamp):
Returns
-------
str
+ The full month name as a string.
+
+ See Also
+ --------
+ Timestamp.day_name : Returns the name of the day of the week.
+ Timestamp.strftime : Returns a formatted string of the Timestamp.
+ datetime.datetime.strftime : Returns a string representing the date and time.
Examples
--------
+ Get the month name in English (default):
+
>>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
>>> ts.month_name()
'March'
@@ -890,17 +926,38 @@ cdef class _Timestamp(ABCTimestamp):
@property
def quarter(self) -> int:
"""
- Return the quarter of the year.
+ Return the quarter of the year for the `Timestamp`.
+
+ This property returns an integer representing the quarter of the year in
+ which the `Timestamp` falls. The quarters are defined as follows:
+ - Q1: January 1 to March 31
+ - Q2: April 1 to June 30
+ - Q3: July 1 to September 30
+ - Q4: October 1 to December 31
Returns
-------
int
+ The quarter of the year (1 through 4).
+
+ See Also
+ --------
+ Timestamp.month : Returns the month of the `Timestamp`.
+ Timestamp.year : Returns the year of the `Timestamp`.
Examples
--------
+ Get the quarter for a `Timestamp`:
+
>>> ts = pd.Timestamp(2020, 3, 14)
>>> ts.quarter
1
+
+ For a `Timestamp` in the fourth quarter:
+
+ >>> ts = pd.Timestamp(2020, 10, 14)
+ >>> ts.quarter
+ 4
"""
return ((self.month - 1) // 3) + 1
@@ -955,6 +1012,21 @@ cdef class _Timestamp(ABCTimestamp):
"""
Normalize Timestamp to midnight, preserving tz information.
+ This method sets the time component of the `Timestamp` to midnight (00:00:00),
+ while preserving the date and time zone information. It is useful when you
+ need to standardize the time across different `Timestamp` objects without
+ altering the time zone or the date.
+
+ Returns
+ -------
+ Timestamp
+
+ See Also
+ --------
+ Timestamp.floor : Rounds `Timestamp` down to the nearest frequency.
+ Timestamp.ceil : Rounds `Timestamp` up to the nearest frequency.
+ Timestamp.round : Rounds `Timestamp` to the nearest frequency.
+
Examples
--------
>>> ts = pd.Timestamp(2020, 3, 14, 15, 30)
@@ -1190,6 +1262,23 @@ cdef class _Timestamp(ABCTimestamp):
"""
Return POSIX timestamp as float.
+ This method converts the `Timestamp` object to a POSIX timestamp, which is
+ the number of seconds since the Unix epoch (January 1, 1970). The returned
+ value is a floating-point number, where the integer part represents the
+ seconds, and the fractional part represents the microseconds.
+
+ Returns
+ -------
+ float
+ The POSIX timestamp representation of the `Timestamp` object.
+
+ See Also
+ --------
+ Timestamp.fromtimestamp : Construct a `Timestamp` from a POSIX timestamp.
+ datetime.datetime.timestamp : Equivalent method from the `datetime` module.
+ Timestamp.to_pydatetime : Convert the `Timestamp` to a `datetime` object.
+ Timestamp.to_datetime64 : Converts `Timestamp` to `numpy.datetime64`.
+
Examples
--------
>>> ts = pd.Timestamp('2020-03-14T15:32:52.192548')
@@ -1253,7 +1342,17 @@ cdef class _Timestamp(ABCTimestamp):
cpdef to_datetime64(self):
"""
- Return a numpy.datetime64 object with same precision.
+ Return a NumPy datetime64 object with same precision.
+
+ This method returns a numpy.datetime64 object with the same
+ date and time information and precision as the pd.Timestamp object.
+
+ See Also
+ --------
+ numpy.datetime64 : Class to represent dates and times with high precision.
+ Timestamp.to_numpy : Alias for this method.
+ Timestamp.asm8 : Alias for this method.
+ pd.to_datetime : Convert argument to datetime.
Examples
--------
@@ -1276,6 +1375,15 @@ cdef class _Timestamp(ABCTimestamp):
copy parameters are available here only for compatibility. Their values
will not affect the return value.
+ Parameters
+ ----------
+ dtype : dtype, optional
+ Data type of the output, ignored in this method as the return type
+ is always `numpy.datetime64`.
+ copy : bool, default False
+ Whether to ensure that the returned value is a new object. This
+ parameter is also ignored as the method does not support copying.
+
Returns
-------
numpy.datetime64
@@ -1305,6 +1413,21 @@ cdef class _Timestamp(ABCTimestamp):
"""
Return a Period of which this timestamp is an observation.
+ This method converts the given Timestamp to a Period object,
+ which represents a span of time, such as a year, month, etc.,
+ based on the specified frequency.
+
+ Parameters
+ ----------
+ freq : str, optional
+ Frequency string for the period (e.g., 'Y', 'M', 'W'). Defaults to `None`.
+
+ See Also
+ --------
+ Timestamp : Represents a specific timestamp.
+ Period : Represents a span of time.
+ to_period : Converts an object to a Period.
+
Examples
--------
>>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
@@ -1442,6 +1565,11 @@ class Timestamp(_Timestamp):
"""
Construct a timestamp from a proleptic Gregorian ordinal.
+ This method creates a `Timestamp` object corresponding to the given
+ proleptic Gregorian ordinal, which is a count of days from January 1,
+ 0001 (using the proleptic Gregorian calendar). The time part of the
+ `Timestamp` is set to midnight (00:00:00) by default.
+
Parameters
----------
ordinal : int
@@ -1449,14 +1577,31 @@ class Timestamp(_Timestamp):
tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None
Time zone for the Timestamp.
+ Returns
+ -------
+ Timestamp
+ A `Timestamp` object representing the specified ordinal date.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ to_datetime : Converts various types of data to datetime.
+
Notes
-----
By definition there cannot be any tz info on the ordinal itself.
Examples
--------
+ Convert an ordinal to a `Timestamp`:
+
>>> pd.Timestamp.fromordinal(737425)
Timestamp('2020-01-01 00:00:00')
+
+ Create a `Timestamp` from an ordinal with timezone information:
+
+ >>> pd.Timestamp.fromordinal(737425, tz='UTC')
+ Timestamp('2020-01-01 00:00:00+0000', tz='UTC')
"""
return cls(datetime.fromordinal(ordinal), tz=tz)
@@ -1507,6 +1652,12 @@ class Timestamp(_Timestamp):
tz : str or timezone object, default None
Timezone to localize to.
+ See Also
+ --------
+ datetime.datetime.today : Returns the current local datetime.
+ Timestamp.now : Returns current time with optional timezone.
+ Timestamp : A class representing a specific timestamp.
+
Examples
--------
>>> pd.Timestamp.today() # doctest: +SKIP
@@ -1560,6 +1711,21 @@ class Timestamp(_Timestamp):
Construct a timezone-aware UTC datetime from a POSIX timestamp.
+ This method creates a timezone-aware UTC `Timestamp` from a POSIX
+ timestamp, so the result always carries the UTC timezone.
+
+ Parameters
+ ----------
+ ts : float
+ POSIX timestamp.
+
+ See Also
+ --------
+ Timestamp.tzname : Return time zone name.
+ Timestamp.utcnow : Return a new Timestamp representing UTC day and time.
+ Timestamp.fromtimestamp : Transform timestamp[, tz] to tz's local
+ time from POSIX timestamp.
+
Notes
-----
Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp
@@ -1584,16 +1750,43 @@ class Timestamp(_Timestamp):
@classmethod
def fromtimestamp(cls, ts, tz=None):
"""
- Timestamp.fromtimestamp(ts)
+ Create a `Timestamp` object from a POSIX timestamp.
+
+ This method converts a POSIX timestamp (the number of seconds since
+ January 1, 1970, 00:00:00 UTC) into a `Timestamp` object. The resulting
+ `Timestamp` can be localized to a specific time zone if provided.
- Transform timestamp[, tz] to tz's local time from POSIX timestamp.
+ Parameters
+ ----------
+ ts : float
+ The POSIX timestamp to convert, representing seconds since
+ the epoch (1970-01-01 00:00:00 UTC).
+ tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, optional
+ Time zone for the `Timestamp`. If not provided, the `Timestamp` will
+ be timezone-naive (i.e., without time zone information).
+
+ Returns
+ -------
+ Timestamp
+ A `Timestamp` object representing the given POSIX timestamp.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ to_datetime : Converts various types of data to datetime.
+ datetime.datetime.fromtimestamp : Returns a datetime from a POSIX timestamp.
Examples
--------
+ Convert a POSIX timestamp to a `Timestamp`:
+
>>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP
Timestamp('2020-03-14 15:32:52')
- Note that the output may change depending on your local time.
+ Note that the output may change depending on your local time and time zone:
+
+ >>> pd.Timestamp.fromtimestamp(1584199972, tz='UTC') # doctest: +SKIP
+ Timestamp('2020-03-14 15:32:52+0000', tz='UTC')
"""
tz = maybe_get_tz(tz)
return cls(datetime.fromtimestamp(ts, tz))
@@ -1636,7 +1829,25 @@ class Timestamp(_Timestamp):
def ctime(self):
"""
- Return ctime() style string.
+ Return a ctime() style string representing the Timestamp.
+
+ This method returns a string representing the date and time
+ in the format returned by the standard library's `time.ctime()`
+ function, which is typically in the form 'Day Mon DD HH:MM:SS YYYY'.
+
+ If the `Timestamp` is outside the range supported by Python's
+ standard library, a `NotImplementedError` is raised.
+
+ Returns
+ -------
+ str
+ A string representing the Timestamp in ctime format.
+
+ See Also
+ --------
+ time.ctime : Return a string representing time in ctime format.
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ datetime.datetime.ctime : Return a ctime style string from a datetime object.
Examples
--------
@@ -1661,10 +1872,25 @@ class Timestamp(_Timestamp):
def date(self):
"""
- Return date object with same year, month and day.
+ Returns `datetime.date` with the same year, month, and day.
+
+ This method extracts the date component from the `Timestamp` and returns
+ it as a `datetime.date` object, discarding the time information.
+
+ Returns
+ -------
+ datetime.date
+ The date part of the `Timestamp`.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ datetime.datetime.date : Extract the date component from a `datetime` object.
Examples
--------
+ Extract the date from a Timestamp:
+
>>> ts = pd.Timestamp('2023-01-01 10:00:00.00')
>>> ts
Timestamp('2023-01-01 10:00:00')
@@ -1735,6 +1961,13 @@ class Timestamp(_Timestamp):
"""
Return time zone name.
+ This method returns the name of the Timestamp's time zone as a string.
+
+ See Also
+ --------
+ Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+ Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -1749,6 +1982,22 @@ class Timestamp(_Timestamp):
"""
Return utc offset.
+ This method returns the difference between the local time and UTC
+ as a `timedelta` object. It is useful for determining how far the
+ Timestamp's timezone is offset from UTC.
+
+ Returns
+ -------
+ timedelta
+ The difference between UTC and the local time as a `timedelta` object.
+
+ See Also
+ --------
+ datetime.datetime.utcoffset :
+ Standard library method to get the UTC offset of a datetime object.
+ Timestamp.tzname : Return the name of the timezone.
+ Timestamp.dst : Return the daylight saving time (DST) adjustment.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -1761,7 +2010,24 @@ class Timestamp(_Timestamp):
def utctimetuple(self):
"""
- Return UTC time tuple, compatible with time.localtime().
+ Return UTC time tuple, compatible with `time.localtime()`.
+
+ This method converts the Timestamp to UTC and returns a time tuple
+ containing 9 components: year, month, day, hour, minute, second,
+ weekday, day of year, and DST flag. This is particularly useful for
+ converting a Timestamp to a format compatible with time module functions.
+
+ Returns
+ -------
+ time.struct_time
+ A time.struct_time object representing the UTC time.
+
+ See Also
+ --------
+ datetime.datetime.utctimetuple :
+ Return UTC time tuple, compatible with time.localtime().
+ Timestamp.timetuple : Return time tuple of local time.
+ time.struct_time : Time tuple structure used by time functions.
Examples
--------
@@ -1802,6 +2068,17 @@ class Timestamp(_Timestamp):
"""
Return time tuple, compatible with time.localtime().
+ This method converts the `Timestamp` into a time tuple, which is compatible
+ with functions like `time.localtime()`. The time tuple is a named tuple with
+ attributes such as year, month, day, hour, minute, second, weekday,
+ day of the year, and daylight saving time indicator.
+
+ See Also
+ --------
+ time.localtime : Converts a POSIX timestamp into a time tuple.
+ Timestamp : The `Timestamp` that represents a specific point in time.
+ datetime.datetime.timetuple : Equivalent method in the `datetime` module.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00')
@@ -1826,6 +2103,19 @@ class Timestamp(_Timestamp):
"""
Return time object with same time and tzinfo.
+ This method returns a datetime.time object with
+ the time and tzinfo corresponding to the pd.Timestamp
+ object, ignoring any information about the day/date.
+
+ See Also
+ --------
+ datetime.datetime.timetz : Return datetime.time object with the
+ same time attributes as the datetime object.
+ datetime.time : Class to represent the time of day, independent
+ of any particular day.
+ datetime.datetime.tzinfo : Attribute of datetime.datetime objects
+ representing the timezone of the datetime object.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -1840,6 +2130,17 @@ class Timestamp(_Timestamp):
"""
Return proleptic Gregorian ordinal. January 1 of year 1 is day 1.
+ The proleptic Gregorian ordinal is a continuous count of days since
+ January 1 of year 1, which is considered day 1. This method converts
+ the `Timestamp` to its equivalent ordinal number, useful for date arithmetic
+ and comparison operations.
+
+ See Also
+ --------
+ datetime.datetime.toordinal : Equivalent method in the `datetime` module.
+ Timestamp : The `Timestamp` that represents a specific point in time.
+ Timestamp.fromordinal : Create a `Timestamp` from an ordinal.
+
Examples
--------
>>> ts = pd.Timestamp('2023-01-01 10:00:50')
@@ -1863,9 +2164,27 @@ class Timestamp(_Timestamp):
@classmethod
def strptime(cls, date_string, format):
"""
- Timestamp.strptime(string, format)
+ Convert string argument to datetime.
- Function is not implemented. Use pd.to_datetime().
+ This method is not implemented; calling it will raise NotImplementedError.
+ Use pd.to_datetime() instead.
+
+ Parameters
+ ----------
+ date_string : str
+ String to convert to a datetime.
+ format : str, default None
+ The format string to parse time, e.g. "%d/%m/%Y".
+
+ See Also
+ --------
+ pd.to_datetime : Convert argument to datetime.
+ datetime.datetime.strptime : Return a datetime corresponding to a string
+ representing a date and time, parsed according to a separate
+ format string.
+ datetime.datetime.strftime : Return a string representing the date and
+ time, controlled by an explicit format string.
+ Timestamp.isoformat : Return the time formatted according to ISO 8601.
Examples
--------
@@ -1883,7 +2202,28 @@ class Timestamp(_Timestamp):
"""
Timestamp.combine(date, time)
- Combine date, time into datetime with same date and time fields.
+ Combine a date and time into a single Timestamp object.
+
+ This method takes a `date` object and a `time` object
+ and combines them into a single `Timestamp`
+ that has the same date and time fields.
+
+ Parameters
+ ----------
+ date : datetime.date
+ The date part of the Timestamp.
+ time : datetime.time
+ The time part of the Timestamp.
+
+ Returns
+ -------
+ Timestamp
+ A new `Timestamp` object representing the combined date and time.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ to_datetime : Converts various types of data to datetime.
Examples
--------
@@ -2104,9 +2444,9 @@ class Timestamp(_Timestamp):
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -2117,7 +2457,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -2207,9 +2547,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -2220,7 +2560,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Raises
@@ -2302,9 +2642,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
+ nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
timedelta}, default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -2315,7 +2655,7 @@ timedelta}, default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Raises
@@ -2441,9 +2781,9 @@ timedelta}, default 'raise'
* bool contains flags to determine if time is dst or not (note
that this flag is only applicable for ambiguous fall dst dates).
* 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
+ * 'raise' will raise a ValueError for an ambiguous time.
- nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \
+ nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \
default 'raise'
A nonexistent time does not exist in a particular timezone
where clocks moved forward due to DST.
@@ -2456,7 +2796,7 @@ default 'raise'
closest existing time.
* 'NaT' will return NaT where there are nonexistent times.
* timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise an NonExistentTimeError if there are
+ * 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -2468,6 +2808,13 @@ default 'raise'
TypeError
If the Timestamp is tz-aware and tz is not None.
+ See Also
+ --------
+ Timestamp.tzinfo : Returns the timezone information of the Timestamp.
+ Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone.
+ DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone.
+ datetime.datetime.astimezone : Convert a datetime object to another time zone.
+
Examples
--------
Create a naive timestamp object:
@@ -2603,22 +2950,48 @@ default 'raise'
"""
Implements datetime.replace, handles nanoseconds.
+ This method creates a new `Timestamp` object by replacing the specified
+ fields with new values. The new `Timestamp` retains the original fields
+ that are not explicitly replaced. This method handles nanoseconds, and
+ the `tzinfo` parameter allows for timezone replacement without conversion.
+
Parameters
----------
year : int, optional
+ The year to replace. If `None`, the year is not changed.
month : int, optional
+ The month to replace. If `None`, the month is not changed.
day : int, optional
+ The day to replace. If `None`, the day is not changed.
hour : int, optional
+ The hour to replace. If `None`, the hour is not changed.
minute : int, optional
+ The minute to replace. If `None`, the minute is not changed.
second : int, optional
+ The second to replace. If `None`, the second is not changed.
microsecond : int, optional
+ The microsecond to replace. If `None`, the microsecond is not changed.
nanosecond : int, optional
+ The nanosecond to replace. If `None`, the nanosecond is not changed.
tzinfo : tz-convertible, optional
+ The timezone information to replace. If `None`, the timezone is not changed.
fold : int, optional
+ The fold information to replace. If `None`, the fold is not changed.
Returns
-------
- Timestamp with fields replaced
+ Timestamp
+ A new `Timestamp` object with the specified fields replaced.
+
+ See Also
+ --------
+ Timestamp : Represents a single timestamp, similar to `datetime`.
+ to_datetime : Converts various types of data to datetime.
+
+ Notes
+ -----
+ The `replace` method does not perform timezone conversions. If you need
+ to convert the timezone, use the `tz_convert` method instead.
Examples
--------
@@ -2741,7 +3114,14 @@ default 'raise'
"""
Convert Timestamp to a Julian Date.
- 0 Julian date is noon January 1, 4713 BC.
+ This method returns the number of days as a float since
+ 0 Julian date, which is noon January 1, 4713 BC.
+
+ See Also
+ --------
+ Timestamp.toordinal : Return proleptic Gregorian ordinal.
+ Timestamp.timestamp : Return POSIX timestamp as float.
+ Timestamp : Represents a single timestamp.
Examples
--------
diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx
index 6292b6ce0fd1d..36b644ffc826d 100644
--- a/pandas/_libs/tslibs/timezones.pyx
+++ b/pandas/_libs/tslibs/timezones.pyx
@@ -2,17 +2,10 @@ from datetime import (
timedelta,
timezone,
)
+import zoneinfo
from pandas.compat._optional import import_optional_dependency
-try:
- # py39+
- import zoneinfo
- from zoneinfo import ZoneInfo
-except ImportError:
- zoneinfo = None
- ZoneInfo = None
-
from cpython.datetime cimport (
datetime,
timedelta,
@@ -28,8 +21,8 @@ from dateutil.tz import (
tzutc as _dateutil_tzutc,
)
import numpy as np
-import pytz
-from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo
+
+pytz = import_optional_dependency("pytz", errors="ignore")
cimport numpy as cnp
from numpy cimport int64_t
@@ -45,10 +38,11 @@ from pandas._libs.tslibs.util cimport (
cdef int64_t NPY_NAT = get_nat()
cdef tzinfo utc_stdlib = timezone.utc
-cdef tzinfo utc_pytz = pytz.utc
+cdef tzinfo utc_pytz = pytz.UTC if pytz else None
cdef tzinfo utc_dateutil_str = dateutil_gettz("UTC") # NB: *not* the same as tzutc()
cdef tzinfo utc_zoneinfo = None
+cdef type ZoneInfo = zoneinfo.ZoneInfo
# ----------------------------------------------------------------------
@@ -56,13 +50,13 @@ cdef tzinfo utc_zoneinfo = None
cdef bint is_utc_zoneinfo(tzinfo tz):
# Workaround for cases with missing tzdata
# https://github.com/pandas-dev/pandas/pull/46425#discussion_r830633025
- if tz is None or zoneinfo is None:
+ if tz is None:
return False
global utc_zoneinfo
if utc_zoneinfo is None:
try:
- utc_zoneinfo = ZoneInfo("UTC")
+ utc_zoneinfo = zoneinfo.ZoneInfo("UTC")
except zoneinfo.ZoneInfoNotFoundError:
return False
# Warn if tzdata is too old, even if there is a system tzdata to alert
@@ -74,17 +68,15 @@ cdef bint is_utc_zoneinfo(tzinfo tz):
cpdef inline bint is_utc(tzinfo tz):
return (
- tz is utc_pytz
- or tz is utc_stdlib
+ tz is utc_stdlib
or isinstance(tz, _dateutil_tzutc)
or tz is utc_dateutil_str
or is_utc_zoneinfo(tz)
+ or (utc_pytz is not None and tz is utc_pytz)
)
cdef bint is_zoneinfo(tzinfo tz):
- if ZoneInfo is None:
- return False
return isinstance(tz, ZoneInfo)
@@ -166,7 +158,7 @@ cpdef inline tzinfo maybe_get_tz(object tz):
elif tz == "UTC" or tz == "utc":
tz = utc_stdlib
else:
- tz = pytz.timezone(tz)
+ tz = zoneinfo.ZoneInfo(tz)
elif is_integer_object(tz):
tz = timezone(timedelta(seconds=tz))
elif isinstance(tz, tzinfo):
@@ -205,7 +197,7 @@ cdef object tz_cache_key(tzinfo tz):
the same tz file). Also, pytz objects are not always hashable so we use
str(tz) instead.
"""
- if isinstance(tz, _pytz_BaseTzInfo):
+ if pytz is not None and isinstance(tz, pytz.tzinfo.BaseTzInfo):
return tz.zone
elif isinstance(tz, _dateutil_tzfile):
if ".tar.gz" in tz._filename:
@@ -239,7 +231,7 @@ cpdef inline bint is_fixed_offset(tzinfo tz):
return 1
else:
return 0
- elif treat_tz_as_pytz(tz):
+ elif treat_tz_as_pytz(tz) and pytz is not None:
if (len(tz._transition_info) == 0
and len(tz._utc_transition_times) == 0):
return 1
diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx
index e3facd3d9599b..c100f315e9a19 100644
--- a/pandas/_libs/tslibs/tzconversion.pyx
+++ b/pandas/_libs/tslibs/tzconversion.pyx
@@ -15,7 +15,6 @@ from cython cimport Py_ssize_t
import_datetime()
import numpy as np
-import pytz
cimport numpy as cnp
from numpy cimport (
@@ -196,8 +195,8 @@ def tz_localize_to_utc(
NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns,
):
"""
- Localize tzinfo-naive i8 to given time zone (using pytz). If
- there are ambiguities in the values, raise AmbiguousTimeError.
+ Localize tzinfo-naive i8 to given time zone. If
+ there are ambiguities in the values, raise ValueError.
Parameters
----------
@@ -368,7 +367,7 @@ timedelta-like}
result[i] = NPY_NAT
else:
stamp = _render_tstamp(val, creso=creso)
- raise pytz.AmbiguousTimeError(
+ raise ValueError(
f"Cannot infer dst time from {stamp}, try using the "
"'ambiguous' argument"
)
@@ -428,7 +427,10 @@ timedelta-like}
result[i] = NPY_NAT
else:
stamp = _render_tstamp(val, creso=creso)
- raise pytz.NonExistentTimeError(stamp)
+ raise ValueError(
+ f"{stamp} is a nonexistent time due to daylight savings time. "
+ "Try using the 'nonexistent' argument."
+ )
return result.base # .base to get underlying ndarray
@@ -631,7 +633,7 @@ cdef ndarray[int64_t] _get_dst_hours(
if trans_idx.size == 1:
# see test_tz_localize_to_utc_ambiguous_infer
stamp = _render_tstamp(vals[trans_idx[0]], creso=creso)
- raise pytz.AmbiguousTimeError(
+ raise ValueError(
f"Cannot infer dst time from {stamp} as there "
"are no repeated times"
)
@@ -653,14 +655,16 @@ cdef ndarray[int64_t] _get_dst_hours(
if grp.size == 1 or np.all(delta > 0):
# see test_tz_localize_to_utc_ambiguous_infer
stamp = _render_tstamp(vals[grp[0]], creso=creso)
- raise pytz.AmbiguousTimeError(stamp)
+ raise ValueError(
+ f"{stamp} is an ambiguous time and cannot be inferred."
+ )
# Find the index for the switch and pull from a for dst and b
# for standard
switch_idxs = (delta <= 0).nonzero()[0]
if switch_idxs.size > 1:
# see test_tz_localize_to_utc_ambiguous_infer
- raise pytz.AmbiguousTimeError(
+ raise ValueError(
f"There are {switch_idxs.size} dst switches when "
"there should only be 1."
)
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 288559d386a71..756c209661fbb 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -33,6 +33,7 @@
pa_version_under14p1,
pa_version_under16p0,
pa_version_under17p0,
+ pa_version_under18p0,
)
if TYPE_CHECKING:
@@ -157,6 +158,7 @@ def is_ci_environment() -> bool:
"pa_version_under14p1",
"pa_version_under16p0",
"pa_version_under17p0",
+ "pa_version_under18p0",
"HAS_PYARROW",
"IS64",
"ISMUSL",
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 06082e71af32a..6b90389a62056 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -43,6 +43,7 @@
"pyreadstat": "1.2.0",
"pytest": "7.3.2",
"python-calamine": "0.1.7",
+ "pytz": "2023.4",
"pyxlsb": "1.0.10",
"s3fs": "2022.11.0",
"scipy": "1.10.0",
diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py
index ebfc0d69d9655..bd009b544f31e 100644
--- a/pandas/compat/pyarrow.py
+++ b/pandas/compat/pyarrow.py
@@ -17,6 +17,7 @@
pa_version_under15p0 = _palv < Version("15.0.0")
pa_version_under16p0 = _palv < Version("16.0.0")
pa_version_under17p0 = _palv < Version("17.0.0")
+ pa_version_under18p0 = _palv < Version("18.0.0")
HAS_PYARROW = True
except ImportError:
pa_version_under10p1 = True
@@ -28,4 +29,5 @@
pa_version_under15p0 = True
pa_version_under16p0 = True
pa_version_under17p0 = True
+ pa_version_under18p0 = True
HAS_PYARROW = False
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 7c485515f0784..d11213f1164bc 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -32,7 +32,10 @@
import gc
import operator
import os
-from typing import TYPE_CHECKING
+from typing import (
+ TYPE_CHECKING,
+ Any,
+)
import uuid
from dateutil.tz import (
@@ -43,11 +46,8 @@
from hypothesis import strategies as st
import numpy as np
import pytest
-from pytz import (
- FixedOffset,
- utc,
-)
+from pandas.compat._optional import import_optional_dependency
import pandas.util._test_decorators as td
from pandas.core.dtypes.dtypes import (
@@ -92,12 +92,7 @@
del pa
has_pyarrow = True
-import zoneinfo
-
-try:
- zoneinfo.ZoneInfo("UTC")
-except zoneinfo.ZoneInfoNotFoundError:
- zoneinfo = None # type: ignore[assignment]
+pytz = import_optional_dependency("pytz", errors="ignore")
# ----------------------------------------------------------------
@@ -1199,19 +1194,19 @@ def deco(*args):
"UTC-02:15",
tzutc(),
tzlocal(),
- FixedOffset(300),
- FixedOffset(0),
- FixedOffset(-300),
timezone.utc,
timezone(timedelta(hours=1)),
timezone(timedelta(hours=-1), name="foo"),
]
-if zoneinfo is not None:
+if pytz is not None:
TIMEZONES.extend(
- [
- zoneinfo.ZoneInfo("US/Pacific"), # type: ignore[list-item]
- zoneinfo.ZoneInfo("UTC"), # type: ignore[list-item]
- ]
+ (
+ pytz.FixedOffset(300),
+ pytz.FixedOffset(0),
+ pytz.FixedOffset(-300),
+ pytz.timezone("US/Pacific"),
+ pytz.timezone("UTC"),
+ )
)
TIMEZONE_IDS = [repr(i) for i in TIMEZONES]
@@ -1234,9 +1229,10 @@ def tz_aware_fixture(request):
return request.param
-_UTCS = ["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]
-if zoneinfo is not None:
- _UTCS.append(zoneinfo.ZoneInfo("UTC"))
+_UTCS = ["utc", "dateutil/UTC", tzutc(), timezone.utc]
+
+if pytz is not None:
+ _UTCS.append(pytz.utc)
@pytest.fixture(params=_UTCS)
@@ -2046,12 +2042,12 @@ def using_infer_string() -> bool:
return pd.options.future.infer_string is True
-warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"]
-if zoneinfo is not None:
- warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type]
+_warsaws: list[Any] = ["Europe/Warsaw", "dateutil/Europe/Warsaw"]
+if pytz is not None:
+ _warsaws.append(pytz.timezone("Europe/Warsaw"))
-@pytest.fixture(params=warsaws)
+@pytest.fixture(params=_warsaws)
def warsaw(request) -> str:
"""
tzinfo for Europe/Warsaw using pytz, dateutil, or zoneinfo.
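
The conftest now treats pytz like any other optional dependency: imported via import_optional_dependency with errors="ignore", and pytz-specific fixture parameters registered only when it is installed. The same pattern, sketched outside conftest:

    from pandas.compat._optional import import_optional_dependency

    pytz = import_optional_dependency("pytz", errors="ignore")
    if pytz is not None:
        tz = pytz.timezone("US/Pacific")  # exercised only when pytz is available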
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 948836bf6a51d..56f8adda93251 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1529,9 +1529,7 @@ def safe_sort(
order2 = sorter.argsort()
if verify:
mask = (codes < -len(values)) | (codes >= len(values))
- codes[mask] = 0
- else:
- mask = None
+ codes[mask] = -1
new_codes = take_nd(order2, codes, fill_value=-1)
else:
reverse_indexer = np.empty(len(sorter), dtype=int)
@@ -1540,14 +1538,6 @@ def safe_sort(
# may deal with them here without performance loss using `mode='wrap'`
new_codes = reverse_indexer.take(codes, mode="wrap")
- if use_na_sentinel:
- mask = codes == -1
- if verify:
- mask = mask | (codes < -len(values)) | (codes >= len(values))
-
- if use_na_sentinel and mask is not None:
- np.putmask(new_codes, mask, -1)
-
return ordered, ensure_platform_int(new_codes)
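
With this simplification, out-of-bounds codes are normalized to -1 up front, so the take_nd call with fill_value=-1 handles NA sentinels and invalid entries in one pass instead of a second masking step. A small illustration (safe_sort is internal API, so treat this as a sketch):

    import numpy as np
    from pandas.core.algorithms import safe_sort

    values = np.array([3, 1, 2])
    codes = np.array([0, 5, -1])  # 5 is out of bounds, -1 is the NA sentinel
    ordered, new_codes = safe_sort(values, codes, use_na_sentinel=True, verify=True)
    # ordered -> [1, 2, 3]; both invalid entries come back as -1 in new_codes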
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index d07bfeda50e1d..e95fa441e18fb 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -709,7 +709,13 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray:
if isinstance(
other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
- result = pc_func(self._pa_array, self._box_pa(other))
+ try:
+ result = pc_func(self._pa_array, self._box_pa(other))
+ except pa.ArrowNotImplementedError:
+ # TODO: could this be wrong if other is object dtype?
+ # in which case we need to operate pointwise?
+ result = ops.invalid_comparison(self, other, op)
+ result = pa.array(result, type=pa.bool_())
elif is_scalar(other):
try:
result = pc_func(self._pa_array, self._box_pa(other))
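
This gives the array-like branch the same fallback the scalar branch already has: when pyarrow lacks a comparison kernel for the operand types, the result degrades to ops.invalid_comparison (all-False for ==, all-True for !=, TypeError for ordered comparisons) instead of propagating ArrowNotImplementedError. A hedged sketch, assuming pyarrow has no string/int equality kernel:

    import pandas as pd

    ser = pd.Series(["a", "b"], dtype="large_string[pyarrow]")
    ser == [1, 2]  # evaluates to all-False rather than raising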
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index b429b7c1b1fc4..a0c318409d6bb 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -649,6 +649,11 @@ def ndim(self) -> int:
"""
Extension Arrays are only allowed to be 1-dimensional.
+ See Also
+ --------
+ ExtensionArray.shape: Return a tuple of the array dimensions.
+ ExtensionArray.size: The number of elements in the array.
+
Examples
--------
>>> arr = pd.array([1, 2, 3])
@@ -662,6 +667,11 @@ def nbytes(self) -> int:
"""
The number of bytes needed to store this object in memory.
+ See Also
+ --------
+ ExtensionArray.shape: Return a tuple of the array dimensions.
+ ExtensionArray.size: The number of elements in the array.
+
Examples
--------
>>> pd.array([1, 2, 3]).nbytes
@@ -767,6 +777,11 @@ def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll:
an ndarray would be expensive, an ExtensionArray may be
returned.
+ See Also
+ --------
+ ExtensionArray.dropna: Return ExtensionArray without NA values.
+ ExtensionArray.fillna: Fill NA/NaN values using the specified method.
+
Notes
-----
If returning an ExtensionArray, then
@@ -1580,6 +1595,7 @@ def take(
Returns
-------
ExtensionArray
+ An array formed with selected `indices`.
Raises
------
@@ -1832,6 +1848,11 @@ def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> Self:
Returns
-------
ExtensionArray
+ A flattened view on the array.
+
+ See Also
+ --------
+ ExtensionArray.tolist: Return a list of the values.
Notes
-----
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index ad0bde3abbdd4..fbe1677b95b33 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -19,6 +19,7 @@
import numpy as np
+from pandas._config import using_string_dtype
from pandas._config.config import get_option
from pandas._libs import (
@@ -1759,6 +1760,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
dtype='object')
"""
result = self._format_native_types(date_format=date_format, na_rep=np.nan)
+ if using_string_dtype():
+ from pandas import StringDtype
+
+ return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
return result.astype(object, copy=False)
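
Under the future string dtype, strftime results come back as a NaN-backed StringDtype array rather than object. A sketch, assuming the option is enabled:

    import pandas as pd

    pd.set_option("future.infer_string", True)  # opt in to the string dtype
    out = pd.date_range("2024-01-01", periods=2).strftime("%Y-%m-%d")
    # out is backed by StringDtype(na_value=np.nan) instead of object dtype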
@@ -1781,7 +1786,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
a non-DST time (note that this flag is only applicable for
ambiguous times)
- 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ - 'raise' will raise a ValueError if there are ambiguous
times.
nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise'
@@ -1794,7 +1799,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
closest existing time
- 'NaT' will return NaT where there are nonexistent times
- timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise an NonExistentTimeError if there are
+ - 'raise' will raise a ValueError if there are
nonexistent times.
Returns
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index dddfc440109d3..201c449185057 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -15,6 +15,7 @@
import numpy as np
+from pandas._config import using_string_dtype
from pandas._config.config import get_option
from pandas._libs import (
@@ -158,15 +159,8 @@ def f(self):
# these return a boolean by-definition
return result
- if field in self._object_ops:
- result = fields.get_date_name_field(values, field, reso=self._creso)
- result = self._maybe_mask_results(result, fill_value=None)
-
- else:
- result = fields.get_date_field(values, field, reso=self._creso)
- result = self._maybe_mask_results(
- result, fill_value=None, convert="float64"
- )
+ result = fields.get_date_field(values, field, reso=self._creso)
+ result = self._maybe_mask_results(result, fill_value=None, convert="float64")
return result
@@ -243,7 +237,6 @@ def _scalar_type(self) -> type[Timestamp]:
"is_year_end",
"is_leap_year",
]
- _object_ops: list[str] = ["freq", "tz"]
_field_ops: list[str] = [
"year",
"month",
@@ -264,7 +257,7 @@ def _scalar_type(self) -> type[Timestamp]:
]
_other_ops: list[str] = ["date", "time", "timetz"]
_datetimelike_ops: list[str] = (
- _field_ops + _object_ops + _bool_ops + _other_ops + ["unit"]
+ _field_ops + _bool_ops + _other_ops + ["unit", "freq", "tz"]
)
_datetimelike_methods: list[str] = [
"to_period",
@@ -972,7 +965,7 @@ def tz_localize(
non-DST time (note that this flag is only applicable for
ambiguous times)
- 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ - 'raise' will raise a ValueError if there are ambiguous
times.
nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \
@@ -986,7 +979,7 @@ def tz_localize(
closest existing time
- 'NaT' will return NaT where there are nonexistent times
- timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise an NonExistentTimeError if there are
+ - 'raise' will raise a ValueError if there are
nonexistent times.
Returns
@@ -1340,6 +1333,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]:
values, "month_name", locale=locale, reso=self._creso
)
result = self._maybe_mask_results(result, fill_value=None)
+ if using_string_dtype():
+ from pandas import (
+ StringDtype,
+ array as pd_array,
+ )
+
+ return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
return result
def day_name(self, locale=None) -> npt.NDArray[np.object_]:
@@ -1401,6 +1401,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]:
values, "day_name", locale=locale, reso=self._creso
)
result = self._maybe_mask_results(result, fill_value=None)
+ if using_string_dtype():
+ # TODO: no tests that check for dtype of result as of 2024-08-15
+ from pandas import (
+ StringDtype,
+ array as pd_array,
+ )
+
+ return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
return result
@property
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 07eb91e0cb13b..03712f75db0c7 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -557,7 +557,3 @@ def _wrap_ndarray_result(self, result: np.ndarray):
return TimedeltaArray._simple_new(result, dtype=result.dtype)
return type(self)(result)
-
- # ------------------------------------------------------------------------
- # String methods interface
- _str_na_value = np.nan
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 2ba7c9fccbfce..823084c3e9982 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -140,12 +140,16 @@ def __init__(
# infer defaults
if storage is None:
if na_value is not libmissing.NA:
- if HAS_PYARROW:
- storage = "pyarrow"
- else:
- storage = "python"
+ storage = get_option("mode.string_storage")
+ if storage == "auto":
+ if HAS_PYARROW:
+ storage = "pyarrow"
+ else:
+ storage = "python"
else:
storage = get_option("mode.string_storage")
+ if storage == "auto":
+ storage = "python"
if storage == "pyarrow_numpy":
# TODO raise a deprecation warning
@@ -350,9 +354,7 @@ def _str_map(
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
):
if self.dtype.na_value is np.nan:
- return self._str_map_nan_semantics(
- f, na_value=na_value, dtype=dtype, convert=convert
- )
+ return self._str_map_nan_semantics(f, na_value=na_value, dtype=dtype)
from pandas.arrays import BooleanArray
@@ -427,9 +429,7 @@ def _str_map_str_or_object(
# -> We don't know the result type. E.g. `.get` can return anything.
return lib.map_infer_mask(arr, f, mask.view("uint8"))
- def _str_map_nan_semantics(
- self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
- ):
+ def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None):
if dtype is None:
dtype = self.dtype
if na_value is None:
@@ -746,6 +746,12 @@ def _reduce(
axis: AxisInt | None = 0,
**kwargs,
):
+ if self.dtype.na_value is np.nan and name in ["any", "all"]:
+ if name == "any":
+ return nanops.nanany(self._ndarray, skipna=skipna)
+ else:
+ return nanops.nanall(self._ndarray, skipna=skipna)
+
if name in ["min", "max"]:
result = getattr(self, name)(skipna=skipna, axis=axis)
if keepdims:
@@ -754,6 +760,12 @@ def _reduce(
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
+ def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
+ if self.dtype.na_value is np.nan and result is libmissing.NA:
+ # the masked_reductions use pd.NA -> convert to np.nan
+ return np.nan
+ return super()._wrap_reduction_result(axis, result)
+
def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
nv.validate_min((), kwargs)
result = masked_reductions.min(
@@ -771,8 +783,11 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
def value_counts(self, dropna: bool = True) -> Series:
from pandas.core.algorithms import value_counts_internal as value_counts
- result = value_counts(self._ndarray, sort=False, dropna=dropna).astype("Int64")
+ result = value_counts(self._ndarray, sort=False, dropna=dropna)
result.index = result.index.astype(self.dtype)
+
+ if self.dtype.na_value is libmissing.NA:
+ result = result.astype("Int64")
return result
def memory_usage(self, deep: bool = False) -> int:
@@ -823,16 +838,16 @@ def _cmp_method(self, other, op):
# logical
result = np.zeros(len(self._ndarray), dtype="bool")
result[valid] = op(self._ndarray[valid], other)
- return BooleanArray(result, mask)
+ res_arr = BooleanArray(result, mask)
+ if self.dtype.na_value is np.nan:
+ if op == operator.ne:
+ return res_arr.to_numpy(np.bool_, na_value=True)
+ else:
+ return res_arr.to_numpy(np.bool_, na_value=False)
+ return res_arr
_arith_method = _cmp_method
- # ------------------------------------------------------------------------
- # String methods interface
- # error: Incompatible types in assignment (expression has type "NAType",
- # base class "NumpyExtensionArray" defined the type as "float")
- _str_na_value = libmissing.NA # type: ignore[assignment]
-
class StringArrayNumpySemantics(StringArray):
_storage = "python"
@@ -863,38 +878,3 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics:
# need to override NumpyExtensionArray._from_backing_data to ensure
# we always preserve the dtype
return NDArrayBacked._from_backing_data(self, arr)
-
- def _reduce(
- self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
- ):
- if name in ["any", "all"]:
- if name == "any":
- return nanops.nanany(self._ndarray, skipna=skipna)
- else:
- return nanops.nanall(self._ndarray, skipna=skipna)
- else:
- return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
-
- def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
- # the masked_reductions use pd.NA
- if result is libmissing.NA:
- return np.nan
- return super()._wrap_reduction_result(axis, result)
-
- def _cmp_method(self, other, op):
- result = super()._cmp_method(other, op)
- if op == operator.ne:
- return result.to_numpy(np.bool_, na_value=True)
- else:
- return result.to_numpy(np.bool_, na_value=False)
-
- def value_counts(self, dropna: bool = True) -> Series:
- from pandas.core.algorithms import value_counts_internal as value_counts
-
- result = value_counts(self._ndarray, sort=False, dropna=dropna)
- result.index = result.index.astype(self.dtype)
- return result
-
- # ------------------------------------------------------------------------
- # String methods interface
- _str_na_value = np.nan
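
With "auto" as the new mode.string_storage default, the concrete backend is resolved when the dtype is constructed. Roughly (a sketch; the outcome depends on whether pyarrow is installed):

    import numpy as np
    import pandas as pd

    # NaN-semantics dtype: "auto" prefers pyarrow when available, else python
    pd.StringDtype(na_value=np.nan).storage
    # NA-semantics dtype: "auto" currently resolves to the python backend
    pd.StringDtype().storage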
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index cc37995969f0a..67114815341b6 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,6 +1,5 @@
from __future__ import annotations
-from functools import partial
import operator
import re
from typing import (
@@ -36,7 +35,6 @@
BaseStringArray,
StringDtype,
)
-from pandas.core.ops import invalid_comparison
from pandas.core.strings.object_array import ObjectStringArrayMixin
if not pa_version_under10p1:
@@ -130,18 +128,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
def __init__(self, values) -> None:
_chk_pyarrow_available()
- if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
- values.type
+ if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
+ pa.types.is_string(values.type)
+ or (
+ pa.types.is_dictionary(values.type)
+ and (
+ pa.types.is_string(values.type.value_type)
+ or pa.types.is_large_string(values.type.value_type)
+ )
+ )
):
values = pc.cast(values, pa.large_string())
super().__init__(values)
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)
- if not pa.types.is_large_string(self._pa_array.type) and not (
- pa.types.is_dictionary(self._pa_array.type)
- and pa.types.is_large_string(self._pa_array.type.value_type)
- ):
+ if not pa.types.is_large_string(self._pa_array.type):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of "
"large_string type"
@@ -213,12 +215,17 @@ def dtype(self) -> StringDtype: # type: ignore[override]
return self._dtype
def insert(self, loc: int, item) -> ArrowStringArray:
+ if self.dtype.na_value is np.nan and item is np.nan:
+ item = libmissing.NA
if not isinstance(item, str) and item is not libmissing.NA:
raise TypeError("Scalar must be NA or str")
return super().insert(loc, item)
- @classmethod
- def _result_converter(cls, values, na=None):
+ def _result_converter(self, values, na=None):
+ if self.dtype.na_value is np.nan:
+ if not isna(na):
+ values = values.fill_null(bool(na))
+ return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
return BooleanDtype().__from_arrow__(values)
def _maybe_convert_setitem_value(self, value):
@@ -272,10 +279,6 @@ def astype(self, dtype, copy: bool = True):
# ------------------------------------------------------------------------
# String methods interface
- # error: Incompatible types in assignment (expression has type "NAType",
- # base class "ObjectStringArrayMixin" defined the type as "float")
- _str_na_value = libmissing.NA # type: ignore[assignment]
-
_str_map = BaseStringArray._str_map
def _str_contains(
@@ -493,11 +496,30 @@ def _str_get_dummies(self, sep: str = "|"):
return dummies.astype(np.int64, copy=False), labels
def _convert_int_dtype(self, result):
+ if self.dtype.na_value is np.nan:
+ if isinstance(result, pa.Array):
+ result = result.to_numpy(zero_copy_only=False)
+ else:
+ result = result.to_numpy()
+ if result.dtype == np.int32:
+ result = result.astype(np.int64)
+ return result
+
return Int64Dtype().__from_arrow__(result)
def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
+ if self.dtype.na_value is np.nan and name in ["any", "all"]:
+ if not skipna:
+ nas = pc.is_null(self._pa_array)
+ arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
+ else:
+ arr = pc.not_equal(self._pa_array, "")
+ return ArrowExtensionArray(arr)._reduce(
+ name, skipna=skipna, keepdims=keepdims, **kwargs
+ )
+
result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
if name in ("argmin", "argmax") and isinstance(result, pa.Array):
return self._convert_int_dtype(result)
@@ -528,70 +550,31 @@ def _rank(
)
)
-
-class ArrowStringArrayNumpySemantics(ArrowStringArray):
- _storage = "pyarrow"
- _na_value = np.nan
-
- @classmethod
- def _result_converter(cls, values, na=None):
- if not isna(na):
- values = values.fill_null(bool(na))
- return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
-
- def __getattribute__(self, item):
- # ArrowStringArray and we both inherit from ArrowExtensionArray, which
- # creates inheritance problems (Diamond inheritance)
- if item in ArrowStringArrayMixin.__dict__ and item not in (
- "_pa_array",
- "__dict__",
- ):
- return partial(getattr(ArrowStringArrayMixin, item), self)
- return super().__getattribute__(item)
-
- def _convert_int_dtype(self, result):
- if isinstance(result, pa.Array):
- result = result.to_numpy(zero_copy_only=False)
- else:
- result = result.to_numpy()
- if result.dtype == np.int32:
- result = result.astype(np.int64)
+ def value_counts(self, dropna: bool = True) -> Series:
+ result = super().value_counts(dropna=dropna)
+ if self.dtype.na_value is np.nan:
+ res_values = result._values.to_numpy()
+ return result._constructor(
+ res_values, index=result.index, name=result.name, copy=False
+ )
return result
def _cmp_method(self, other, op):
- try:
- result = super()._cmp_method(other, op)
- except pa.ArrowNotImplementedError:
- return invalid_comparison(self, other, op)
- if op == operator.ne:
- return result.to_numpy(np.bool_, na_value=True)
- else:
- return result.to_numpy(np.bool_, na_value=False)
-
- def value_counts(self, dropna: bool = True) -> Series:
- from pandas import Series
-
- result = super().value_counts(dropna)
- return Series(
- result._values.to_numpy(), index=result.index, name=result.name, copy=False
- )
-
- def _reduce(
- self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
- ):
- if name in ["any", "all"]:
- if not skipna:
- nas = pc.is_null(self._pa_array)
- arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
+ result = super()._cmp_method(other, op)
+ if self.dtype.na_value is np.nan:
+ if op == operator.ne:
+ return result.to_numpy(np.bool_, na_value=True)
else:
- arr = pc.not_equal(self._pa_array, "")
- return ArrowExtensionArray(arr)._reduce(
- name, skipna=skipna, keepdims=keepdims, **kwargs
- )
- else:
- return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+ return result.to_numpy(np.bool_, na_value=False)
+ return result
- def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics:
- if item is np.nan:
- item = libmissing.NA
- return super().insert(loc, item) # type: ignore[return-value]
+
+class ArrowStringArrayNumpySemantics(ArrowStringArray):
+ _na_value = np.nan
+ _str_get = ArrowStringArrayMixin._str_get
+ _str_removesuffix = ArrowStringArrayMixin._str_removesuffix
+ _str_capitalize = ArrowStringArrayMixin._str_capitalize
+ _str_pad = ArrowStringArrayMixin._str_pad
+ _str_title = ArrowStringArrayMixin._str_title
+ _str_swapcase = ArrowStringArrayMixin._str_swapcase
+ _str_slice_replace = ArrowStringArrayMixin._str_slice_replace
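
After folding ArrowStringArrayNumpySemantics back into ArrowStringArray, the NaN-semantics code paths are selected by dtype.na_value checks rather than by subclassing; e.g. any/all treat empty strings as falsy via the pyarrow kernels above. A hedged sketch (requires pyarrow):

    import numpy as np
    import pandas as pd

    dtype = pd.StringDtype(storage="pyarrow", na_value=np.nan)
    ser = pd.Series(["a", ""], dtype=dtype)
    ser.any()  # True: "a" is truthy
    ser.all()  # False: "" is falsy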
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 83cc2871f5459..b2cfbe7338c0d 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -152,9 +152,8 @@ def _scalar_type(self) -> type[Timedelta]:
# define my properties & methods for delegation
_other_ops: list[str] = []
_bool_ops: list[str] = []
- _object_ops: list[str] = ["freq"]
_field_ops: list[str] = ["days", "seconds", "microseconds", "nanoseconds"]
- _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + ["unit"]
+ _datetimelike_ops: list[str] = _field_ops + _bool_ops + ["unit", "freq"]
_datetimelike_methods: list[str] = [
"to_pytimedelta",
"total_seconds",
diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py
index 8fbf8936d31ef..35a6d1c6ad269 100644
--- a/pandas/core/computation/parsing.py
+++ b/pandas/core/computation/parsing.py
@@ -4,6 +4,7 @@
from __future__ import annotations
+from enum import Enum
from io import StringIO
from keyword import iskeyword
import token
@@ -32,13 +33,21 @@ def create_valid_python_identifier(name: str) -> str:
------
SyntaxError
If the returned name is not a Python valid identifier, raise an exception.
- This can happen if there is a hashtag in the name, as the tokenizer will
- than terminate and not find the backtick.
- But also for characters that fall out of the range of (U+0001..U+007F).
"""
if name.isidentifier() and not iskeyword(name):
return name
+ # Escape characters that fall outside the ASCII range (U+0001..U+007F).
+ # GH 49633
+ gen = (
+ (c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace")))
+ for c in name
+ )
+ name = "".join(
+ c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_")
+ for c, c_escaped in gen
+ )
+
# Create a dict with the special characters and their replacement string.
# EXACT_TOKEN_TYPES contains these special characters
# token.tok_name contains a readable description of the replacement string.
@@ -54,11 +63,10 @@ def create_valid_python_identifier(name: str) -> str:
"$": "_DOLLARSIGN_",
"€": "_EUROSIGN_",
"°": "_DEGREESIGN_",
- # Including quotes works, but there are exceptions.
"'": "_SINGLEQUOTE_",
'"': "_DOUBLEQUOTE_",
- # Currently not possible. Terminates parser and won't find backtick.
- # "#": "_HASH_",
+ "#": "_HASH_",
+ "`": "_BACKTICK_",
}
)
@@ -127,6 +135,9 @@ def clean_column_name(name: Hashable) -> Hashable:
which is not caught and propagates to the user level.
"""
try:
+ # Escape backticks
+ name = name.replace("`", "``") if isinstance(name, str) else name
+
tokenized = tokenize_string(f"`{name}`")
tokval = next(tokenized)[1]
return create_valid_python_identifier(tokval)
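
Together with the new "_HASH_" and "_BACKTICK_" replacements, this makes previously unparseable column names usable in query/eval. A sketch of the intended behavior:

    import pandas as pd

    df = pd.DataFrame({"col#1": [1, 2], "°C": [20.5, 21.0]})
    df.query("`col#1` == 2")  # '#' no longer terminates the tokenizer
    df.query("`°C` > 20.7")   # non-ASCII names are escaped internally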
@@ -168,6 +179,91 @@ def tokenize_backtick_quoted_string(
return BACKTICK_QUOTED_STRING, source[string_start:string_end]
+class ParseState(Enum):
+ DEFAULT = 0
+ IN_BACKTICK = 1
+ IN_SINGLE_QUOTE = 2
+ IN_DOUBLE_QUOTE = 3
+
+
+def _split_by_backtick(s: str) -> list[tuple[bool, str]]:
+ """
+ Splits a str into substrings along backtick characters (`).
+
+ Disregards backticks inside quotes.
+
+ Parameters
+ ----------
+ s : str
+ The Python source code string.
+
+ Returns
+ -------
+ substrings: list[tuple[bool, str]]
+ List of tuples, where each tuple has two elements:
+ The first is a boolean indicating if the substring is backtick-quoted.
+ The second is the actual substring.
+ """
+ substrings = []
+ substr: list[str] = [] # Will join into a string before adding to `substrings`
+ i = 0
+ parse_state = ParseState.DEFAULT
+ while i < len(s):
+ char = s[i]
+
+ match char:
+ case "`":
+ # start of a backtick-quoted string
+ if parse_state == ParseState.DEFAULT:
+ if substr:
+ substrings.append((False, "".join(substr)))
+
+ substr = [char]
+ i += 1
+ parse_state = ParseState.IN_BACKTICK
+ continue
+
+ elif parse_state == ParseState.IN_BACKTICK:
+ # escaped backtick inside a backtick-quoted string
+ next_char = s[i + 1] if (i != len(s) - 1) else None
+ if next_char == "`":
+ substr.append(char)
+ substr.append(next_char)
+ i += 2
+ continue
+
+ # end of the backtick-quoted string
+ else:
+ substr.append(char)
+ substrings.append((True, "".join(substr)))
+
+ substr = []
+ i += 1
+ parse_state = ParseState.DEFAULT
+ continue
+ case "'":
+ # start of a single-quoted string
+ if parse_state == ParseState.DEFAULT:
+ parse_state = ParseState.IN_SINGLE_QUOTE
+ # end of a single-quoted string
+ elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"):
+ parse_state = ParseState.DEFAULT
+ case '"':
+ # start of a double-quoted string
+ if parse_state == ParseState.DEFAULT:
+ parse_state = ParseState.IN_DOUBLE_QUOTE
+ # end of a double-quoted string
+ elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"):
+ parse_state = ParseState.DEFAULT
+ substr.append(char)
+ i += 1
+
+ if substr:
+ substrings.append((False, "".join(substr)))
+
+ return substrings
+
+
def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
"""
Tokenize a Python source code string.
@@ -182,18 +278,19 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
tok_generator : Iterator[Tuple[int, str]]
An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
"""
+ # GH 59285
+ # Escape characters, including backticks
+ source = "".join(
+ (
+ create_valid_python_identifier(substring[1:-1])
+ if is_backtick_quoted
+ else substring
+ )
+ for is_backtick_quoted, substring in _split_by_backtick(source)
+ )
+
line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)
- # Loop over all tokens till a backtick (`) is found.
- # Then, take all tokens till the next backtick to form a backtick quoted string
- for toknum, tokval, start, _, _ in token_generator:
- if tokval == "`":
- try:
- yield tokenize_backtick_quoted_string(
- token_generator, source, string_start=start[1] + 1
- )
- except Exception as err:
- raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
- else:
- yield toknum, tokval
+ for toknum, tokval, _, _, _ in token_generator:
+ yield toknum, tokval
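
End-to-end, the new pre-tokenization pass means a literal backtick in a column name can be escaped by doubling it. Sketch:

    import pandas as pd

    df = pd.DataFrame({"a`b": [1, 2]})
    df.query("`a``b` > 1")  # the doubled backtick escapes the one in the name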
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index e62cda0dfe8d0..e4eefb570fd95 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -452,13 +452,12 @@ def is_terminal() -> bool:
string_storage_doc = """
: string
- The default storage for StringDtype. This option is ignored if
- ``future.infer_string`` is set to True.
+ The default storage for StringDtype.
"""
def is_valid_string_storage(value: Any) -> None:
- legal_values = ["python", "pyarrow"]
+ legal_values = ["auto", "python", "pyarrow"]
if value not in legal_values:
msg = "Value must be one of python|pyarrow"
if value == "pyarrow_numpy":
@@ -473,7 +472,7 @@ def is_valid_string_storage(value: Any) -> None:
with cf.config_prefix("mode"):
cf.register_option(
"string_storage",
- "python",
+ "auto",
string_storage_doc,
# validator=is_one_of_factory(["python", "pyarrow"]),
validator=is_valid_string_storage,
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 162f6a4d30f3f..3394bf091e228 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1014,10 +1014,8 @@ def convert_dtypes(
Back-end data type applied to the resultant :class:`DataFrame`
(still experimental). Behaviour is as follows:
- * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
+ * ``"numpy_nullable"``: returns nullable-dtype
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
.. versionadded:: 2.0
@@ -1025,6 +1023,8 @@ def convert_dtypes(
-------
np.dtype, or ExtensionDtype
"""
+ from pandas.core.arrays.string_ import StringDtype
+
inferred_dtype: str | DtypeObj
if (
@@ -1103,6 +1103,13 @@ def convert_dtypes(
# If we couldn't do anything else, then we retain the dtype
inferred_dtype = input_array.dtype
+ elif (
+ convert_string
+ and isinstance(input_array.dtype, StringDtype)
+ and input_array.dtype.na_value is np.nan
+ ):
+ inferred_dtype = pandas_dtype_func("string")
+
else:
inferred_dtype = input_array.dtype
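
The new branch upgrades NaN-backed string arrays to the nullable string dtype, in line with the other convert_dtypes conversions. Sketch:

    import numpy as np
    import pandas as pd

    ser = pd.Series(["a", np.nan], dtype=pd.StringDtype(na_value=np.nan))
    ser.convert_dtypes().dtype  # StringDtype with pd.NA semantics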
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 64b5278424192..bcf1ade9b0320 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1274,6 +1274,10 @@ def is_bool_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of a boolean dtype.
+ This function verifies whether a given object is a boolean data type. The input
+ can be an array or a dtype object. Accepted array types include instances
+ of ``np.array``, ``pd.Series``, ``pd.Index``, and similar array-like structures.
+
Parameters
----------
arr_or_dtype : array-like or dtype
@@ -1284,6 +1288,10 @@ def is_bool_dtype(arr_or_dtype) -> bool:
boolean
Whether or not the array or dtype is of a boolean dtype.
+ See Also
+ --------
+ api.types.is_bool : Check if an object is a boolean.
+
Notes
-----
An ExtensionArray is considered boolean when the ``_is_boolean``
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 3aeab96e03163..c0587d36bcb5a 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -18,9 +18,9 @@
cast,
)
import warnings
+import zoneinfo
import numpy as np
-import pytz
from pandas._config.config import get_option
@@ -789,7 +789,7 @@ def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None:
tz = timezones.maybe_get_tz(tz)
tz = timezones.tz_standardize(tz)
elif tz is not None:
- raise pytz.UnknownTimeZoneError(tz)
+ raise zoneinfo.ZoneInfoNotFoundError(tz)
if tz is None:
raise TypeError("A 'tz' is required.")
@@ -882,7 +882,7 @@ def construct_from_string(cls, string: str_type) -> DatetimeTZDtype:
return cls(unit=d["unit"], tz=d["tz"])
except (KeyError, TypeError, ValueError) as err:
# KeyError if maybe_get_tz tries and fails to get a
- # pytz timezone (actually pytz.UnknownTimeZoneError).
+ # zoneinfo timezone (actually zoneinfo.ZoneInfoNotFoundError).
# TypeError if we pass a nonsense tz;
# ValueError if we pass a unit other than "ns"
raise TypeError(msg) from err
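
With pytz gone from the dtype layer, an unresolvable tz raises the stdlib zoneinfo error instead. Per the branch above, any falsy non-None tz takes that path:

    import zoneinfo
    import pandas as pd

    try:
        pd.DatetimeTZDtype(unit="ns", tz="")  # falsy but not None -> unresolvable
    except zoneinfo.ZoneInfoNotFoundError as exc:
        print(exc)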
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b8039746d9952..b84fb33af26e5 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4556,17 +4556,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
quoted string are replaced by strings that are allowed as a Python identifier.
These characters include all operators in Python, the space character, the
question mark, the exclamation mark, the dollar sign, and the euro sign.
- For other characters that fall outside the ASCII range (U+0001..U+007F)
- and those that are not further specified in PEP 3131,
- the query parser will raise an error.
- This excludes whitespace different than the space character,
- but also the hashtag (as it is used for comments) and the backtick
- itself (backtick can also not be escaped).
-
- In a special case, quotes that make a pair around a backtick can
- confuse the parser.
- For example, ```it's` > `that's``` will raise an error,
- as it forms a quoted string (``'s > `that'``) with a backtick inside.
+
+ A backtick can be escaped by doubling it.
See also the `Python documentation about lexical analysis
<https://docs.python.org/3/reference/lexical_analysis.html>`__
@@ -4620,6 +4611,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
raise ValueError(msg)
kwargs["level"] = kwargs.pop("level", 0) + 1
kwargs["target"] = None
+
res = self.eval(expr, **kwargs)
try:
@@ -6406,7 +6398,7 @@ def dropna(
thresh : int, optional
Require that many non-NA values. Cannot be combined with how.
- subset : column label or sequence of labels, optional
+ subset : column label or iterable of labels, optional
Labels along other axis to consider, e.g. if you are dropping rows
these would be a list of columns to include.
inplace : bool, default False
@@ -6536,7 +6528,7 @@ def dropna(
@overload
def drop_duplicates(
self,
- subset: Hashable | Sequence[Hashable] | None = ...,
+ subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: Literal[True],
@@ -6546,7 +6538,7 @@ def drop_duplicates(
@overload
def drop_duplicates(
self,
- subset: Hashable | Sequence[Hashable] | None = ...,
+ subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: Literal[False] = ...,
@@ -6556,7 +6548,7 @@ def drop_duplicates(
@overload
def drop_duplicates(
self,
- subset: Hashable | Sequence[Hashable] | None = ...,
+ subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: bool = ...,
@@ -6565,7 +6557,7 @@ def drop_duplicates(
def drop_duplicates(
self,
- subset: Hashable | Sequence[Hashable] | None = None,
+ subset: Hashable | Iterable[Hashable] | None = None,
*,
keep: DropKeep = "first",
inplace: bool = False,
@@ -6579,7 +6571,7 @@ def drop_duplicates(
Parameters
----------
- subset : column label or sequence of labels, optional
+ subset : column label or iterable of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns.
keep : {'first', 'last', ``False``}, default 'first'
@@ -6669,7 +6661,7 @@ def drop_duplicates(
def duplicated(
self,
- subset: Hashable | Sequence[Hashable] | None = None,
+ subset: Hashable | Iterable[Hashable] | None = None,
keep: DropKeep = "first",
) -> Series:
"""
@@ -6679,7 +6671,7 @@ def duplicated(
Parameters
----------
- subset : column label or sequence of labels, optional
+ subset : column label or iterable of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns.
keep : {'first', 'last', False}, default 'first'
@@ -6771,10 +6763,7 @@ def f(vals) -> tuple[np.ndarray, int]:
return labels.astype("i8"), len(shape)
if subset is None:
- # https://github.com/pandas-dev/pandas/issues/28770
- # Incompatible types in assignment (expression has type "Index", variable
- # has type "Sequence[Any]")
- subset = self.columns # type: ignore[assignment]
+ subset = self.columns
elif (
not np.iterable(subset)
or isinstance(subset, str)
@@ -6795,7 +6784,7 @@ def f(vals) -> tuple[np.ndarray, int]:
if len(subset) == 1 and self.columns.is_unique:
# GH#45236 This is faster than get_group_index below
- result = self[subset[0]].duplicated(keep)
+ result = self[next(iter(subset))].duplicated(keep)
result.name = None
else:
vals = (col.values for name, col in self.items() if name in subset)
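
Widening subset to Iterable[Hashable] (and switching to next(iter(subset))) means non-indexable containers such as dict keys now work. Sketch:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 3, 4]})
    df.drop_duplicates(subset={"a": None}.keys())  # any iterable of labels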
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 8a6fc69d47cc3..0f0078fc3398b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6670,10 +6670,10 @@ def convert_dtypes(
Back-end data type applied to the resultant :class:`DataFrame` or
:class:`Series` (still experimental). Behaviour is as follows:
- * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- or :class:`Series` (default).
+ * ``"numpy_nullable"``: returns nullable-dtype-backed
+ :class:`DataFrame` or :class:`Series`.
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame or Series.
+ :class:`DataFrame` or :class:`Series`.
.. versionadded:: 2.0
@@ -10570,7 +10570,7 @@ def tz_localize(
a non-DST time (note that this flag is only applicable for
ambiguous times)
- 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
+ - 'raise' will raise a ValueError if there are ambiguous
times.
nonexistent : str, default 'raise'
A nonexistent time does not exist in a particular timezone
@@ -10582,7 +10582,7 @@ def tz_localize(
closest existing time
- 'NaT' will return NaT where there are nonexistent times
- timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise an NonExistentTimeError if there are
+ - 'raise' will raise a ValueError if there are
nonexistent times.
Returns
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 00a929724ed4c..3b3cda8f7cd33 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -6,7 +6,6 @@
import warnings
import numpy as np
-import pytz
from pandas._libs import (
NaT,
@@ -162,7 +161,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin):
non-DST time (note that this flag is only applicable for ambiguous
times)
- 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous times.
+ - 'raise' will raise a ValueError if there are ambiguous times.
dayfirst : bool, default False
If True, parse dates in `data` with the day first order.
yearfirst : bool, default False
@@ -264,7 +263,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]:
@doc(DatetimeArray.strftime)
def strftime(self, date_format) -> Index:
arr = self._data.strftime(date_format)
- return Index(arr, name=self.name, dtype=object)
+ return Index(arr, name=self.name, dtype=arr.dtype)
@doc(DatetimeArray.tz_convert)
def tz_convert(self, tz) -> Self:
@@ -591,7 +590,7 @@ def get_loc(self, key):
elif isinstance(key, str):
try:
parsed, reso = self._parse_with_reso(key)
- except (ValueError, pytz.NonExistentTimeError) as err:
+ except ValueError as err:
raise KeyError(key) from err
self._disallow_mismatched_indexing(parsed)
diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py
index 48d5e59250f35..2eeacfb769be4 100644
--- a/pandas/core/indexes/extension.py
+++ b/pandas/core/indexes/extension.py
@@ -74,7 +74,7 @@ def fget(self):
return type(self)._simple_new(result, name=self.name)
elif isinstance(result, ABCDataFrame):
return result.set_index(self)
- return Index(result, name=self.name)
+ return Index(result, name=self.name, dtype=result.dtype)
return result
def fset(self, value) -> None:
@@ -101,7 +101,7 @@ def method(self, *args, **kwargs): # type: ignore[misc]
return type(self)._simple_new(result, name=self.name)
elif isinstance(result, ABCDataFrame):
return result.set_index(self)
- return Index(result, name=self.name)
+ return Index(result, name=self.name, dtype=result.dtype)
return result
# error: "property" has no attribute "__name__"
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 0900121ab717f..c3d4ad721c830 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1636,6 +1636,17 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
doc="""
Names of levels in MultiIndex.
+ This attribute provides access to the names of the levels in a `MultiIndex`.
+ The names are stored as a `FrozenList`, which is an immutable list-like
+ container. Each name corresponds to a level in the `MultiIndex`, and can be
+ used to identify or manipulate the levels individually.
+
+ See Also
+ --------
+ MultiIndex.set_names : Set Index or MultiIndex name.
+ MultiIndex.rename : Rename specific levels in a MultiIndex.
+ Index.names : Get names on index.
+
Examples
--------
>>> mi = pd.MultiIndex.from_arrays(
@@ -2681,8 +2692,15 @@ def sortlevel(
"""
Sort MultiIndex at the requested level.
- The result will respect the original ordering of the associated
- factor at that level.
+ This method is useful when dealing with MultiIndex objects, allowing for
+ sorting at a specific level of the index. The function preserves the
+ relative ordering of data within the same level while sorting
+ the overall MultiIndex. The method provides flexibility with the `ascending`
+ parameter to define the sort order and with the `sort_remaining` parameter to
+ control whether the remaining levels should also be sorted. Sorting a
+ MultiIndex can be crucial when performing operations that require ordered
+ indices, such as grouping or merging datasets. The `na_position` argument is
+ important in handling missing values consistently across different levels.
Parameters
----------
@@ -2692,7 +2710,9 @@ def sortlevel(
ascending : bool, default True
False to sort in descending order.
Can also be a list to specify a directed ordering.
- sort_remaining : sort by the remaining levels after level
+ sort_remaining : bool, default True
+ If True, sorts by the remaining levels after sorting by the specified
+ `level`.
na_position : {'first' or 'last'}, default 'first'
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
the end.
@@ -2706,6 +2726,13 @@ def sortlevel(
indexer : np.ndarray[np.intp]
Indices of output values in original index.
+ See Also
+ --------
+ MultiIndex : A multi-level, or hierarchical, index object for pandas objects.
+ Index.sort_values : Sort Index values.
+ DataFrame.sort_index : Sort DataFrame by the index.
+ Series.sort_index : Sort Series by the index.
+
Examples
--------
>>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]])
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 149bef6258bfa..dfb96162f0ac1 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -512,7 +512,11 @@ def convert(self) -> list[Block]:
convert_non_numeric=True,
)
refs = None
- if res_values is values:
+ if (
+ res_values is values
+ or isinstance(res_values, NumpyExtensionArray)
+ and res_values._ndarray is values
+ ):
refs = self.refs
res_values = ensure_block_shape(res_values, self.ndim)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 25fdafa9b8354..1014c9559afaf 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -969,6 +969,8 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False):
Returns
-------
DataFrame/MultiIndex or Series/Index of objects
+ Returns appropriate type based on the `expand` parameter, with strings
+ split based on the `pat` parameter.
See Also
--------
@@ -1749,6 +1751,18 @@ def pad(
Returns
-------
Series/Index of objects.
+ A Series or Index where the strings are modified by :meth:`str.%(method)s`.
+
+ See Also
+ --------
+ Series.str.rjust : Fills the left side of strings with an arbitrary
+ character.
+ Series.str.ljust : Fills the right side of strings with an arbitrary
+ character.
+ Series.str.center : Fills both sides of strings with an arbitrary
+ character.
+ Series.str.zfill : Pad strings in the Series/Index by prepending '0'
+ character.
Examples
--------
@@ -2024,11 +2038,19 @@ def decode(self, encoding, errors: str = "strict"):
Parameters
----------
encoding : str
+ Specifies the encoding to be used.
errors : str, optional
+ Specifies the error handling scheme.
+ Possible values are those supported by :meth:`bytes.decode`.
Returns
-------
Series or Index
+ A Series or Index with decoded strings.
+
+ See Also
+ --------
+ Series.str.encode : Encodes strings into bytes in a Series/Index.
Examples
--------
@@ -2063,11 +2085,19 @@ def encode(self, encoding, errors: str = "strict"):
Parameters
----------
encoding : str
+ Specifies the encoding to be used.
errors : str, optional
+ Specifies the error handling scheme.
+ Possible values are those supported by :meth:`str.encode`.
Returns
-------
Series/Index of objects
+ A Series or Index with strings encoded into bytes.
+
+ See Also
+ --------
+ Series.str.decode : Decodes bytes into strings in a Series/Index.
Examples
--------
@@ -2099,6 +2129,7 @@ def encode(self, encoding, errors: str = "strict"):
Returns
-------
Series or Index of object
+ Series or Index with the strings stripped from the %(side)s.
See Also
--------
@@ -3209,7 +3240,8 @@ def len(self):
Returns
-------
- Series or Index of object
+ Series or Index of objects
+ A Series or Index where the strings are modified by :meth:`str.%(method)s`.
See Also
--------
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 290a28ab60ae1..100afa956bd24 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -37,8 +37,6 @@ class ObjectStringArrayMixin(BaseStringArrayMethods):
String Methods operating on object-dtype ndarrays.
"""
- _str_na_value = np.nan
-
def __len__(self) -> int:
# For typing, _str_map relies on the object being sized.
raise NotImplementedError
@@ -56,7 +54,7 @@ def _str_map(
na_value : Scalar, optional
The value to set for NA values. Might also be used for the
fill value if the callable `f` raises an exception.
- This defaults to ``self._str_na_value`` which is ``np.nan``
+ This defaults to ``self.dtype.na_value`` which is ``np.nan``
for object-dtype and Categorical and ``pd.NA`` for StringArray.
dtype : Dtype, optional
The dtype of the result array.
@@ -66,7 +64,7 @@ def _str_map(
if dtype is None:
dtype = np.dtype("object")
if na_value is None:
- na_value = self._str_na_value
+ na_value = self.dtype.na_value # type: ignore[attr-defined]
if not len(self):
return np.array([], dtype=dtype)
@@ -272,7 +270,7 @@ def f(x):
return x.get(i)
elif len(x) > i >= -len(x):
return x[i]
- return self._str_na_value
+ return self.dtype.na_value # type: ignore[attr-defined]
return self._str_map(f)
@@ -466,7 +464,7 @@ def _str_removesuffix(self, suffix: str):
def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
regex = re.compile(pat, flags=flags)
- na_value = self._str_na_value
+ na_value = self.dtype.na_value # type: ignore[attr-defined]
if not expand:
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index 26e73794af298..982851d0557c3 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -99,8 +99,8 @@ def to_numeric(
is to not use nullable data types. If specified, the behavior
is as follows:
- * ``"numpy_nullable"``: returns with nullable-dtype-backed
- * ``"pyarrow"``: returns with pyarrow-backed nullable :class:`ArrowDtype`
+ * ``"numpy_nullable"``: returns nullable-dtype-backed object
+ * ``"pyarrow"``: returns with pyarrow-backed nullable object
.. versionadded:: 2.0
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
index 5a0a8c321e629..2ed241f0b9bca 100644
--- a/pandas/io/clipboards.py
+++ b/pandas/io/clipboards.py
@@ -38,14 +38,15 @@ def read_clipboard(
A string or regex delimiter. The default of ``'\\s+'`` denotes
one or more whitespace characters.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index f83f9cb1c8d74..ef52107c283e9 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -267,14 +267,15 @@
Rows at the end to skip (0-indexed).
{storage_options}
-dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
@@ -1728,14 +1729,15 @@ def parse(
comment string and the end of the current line is ignored.
skipfooter : int, default 0
Rows at the end to skip (0-indexed).
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
**kwds : dict, optional
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index 3df3e77a851a3..aaae9857b4fae 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -92,14 +92,15 @@ def read_feather(
Whether to parallelize reading using multiple threads.
{storage_options}
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
- * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 4b8bc48130fab..c9897f628fdc9 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -1131,14 +1131,15 @@ def read_html(
.. versionadded:: 1.5.0
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index b29ead1d14b1d..d077b9e0c4568 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -652,14 +652,15 @@ def read_json(
{storage_options}
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py
index d966e38fa11a5..9d250ee5c08ce 100644
--- a/pandas/io/json/_table_schema.py
+++ b/pandas/io/json/_table_schema.py
@@ -144,11 +144,11 @@ def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
field["freq"] = dtype.freq.freqstr
elif isinstance(dtype, DatetimeTZDtype):
if timezones.is_utc(dtype.tz):
- # timezone.utc has no "zone" attr
field["tz"] = "UTC"
else:
- # error: "tzinfo" has no attribute "zone"
- field["tz"] = dtype.tz.zone # type: ignore[attr-defined]
+ zone = timezones.get_timezone(dtype.tz)
+ if isinstance(zone, str):
+ field["tz"] = zone
elif isinstance(dtype, ExtensionDtype):
field["extDtype"] = dtype.name
return field
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index b297164d5d108..f179dafc919e5 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -61,14 +61,15 @@ def read_orc(
Output always follows the ordering of the file and not the columns list.
This mirrors the original behaviour of
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 77a9cc3fca644..24415299e799b 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -542,14 +542,15 @@ def read_parquet(
.. versionadded:: 1.3.0
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 0cca1ebdb8c8f..6e933f94cf0ba 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -268,6 +268,18 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
Number of lines at bottom of file to skip (Unsupported with ``engine='c'``).
nrows : int, optional
Number of rows of file to read. Useful for reading pieces of large files.
+ Refers to the number of data rows in the returned DataFrame, excluding:
+
+ * The header row containing column names.
+ * Rows before the header row, if ``header=1`` or larger.
+
+ Example usage:
+
+ * To read the first 999,999 (non-header) rows:
+ ``read_csv(..., nrows=999999)``
+
+ * To read rows 1,000,000 through 1,999,999:
+ ``read_csv(..., skiprows=1000000, nrows=999999)``
na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional
Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific
per-column ``NA`` values. By default the following values are interpreted as
@@ -438,14 +450,14 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
{storage_options}
-dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
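
A small self-contained sketch of the row accounting the new ``nrows`` paragraphs describe, with the CSV content inline so nothing beyond the documented keywords is assumed:

    import io

    import pandas as pd

    data = "col\n" + "\n".join(str(i) for i in range(10))
    pd.read_csv(io.StringIO(data), nrows=3)                        # data rows 0..2
    pd.read_csv(io.StringIO(data), skiprows=range(1, 4), nrows=3)  # data rows 3..5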
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
index 313ffa79cbd09..e597463aee453 100644
--- a/pandas/io/spss.py
+++ b/pandas/io/spss.py
@@ -36,14 +36,15 @@ def read_spss(
Return a subset of the columns. If None, return all columns.
convert_categoricals : bool, default is True
Convert categorical columns into pd.Categorical.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed
+ nullable :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 4fd7de7a28855..99dd06568fa01 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -306,14 +306,15 @@ def read_sql_table(
chunksize : int, default None
If specified, returns an iterator where `chunksize` is the number of
rows to include in each chunk.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
@@ -443,14 +444,15 @@ def read_sql_query(
{'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
.. versionadded:: 1.3.0
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
@@ -586,14 +588,15 @@ def read_sql(
chunksize : int, default None
If specified, return an iterator where `chunksize` is the
number of rows to include in each chunk.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
dtype : Type name or dict of columns
@@ -1683,14 +1686,15 @@ def read_table(
chunksize : int, default None
If specified, return an iterator where `chunksize` is the number
of rows to include in each chunk.
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
@@ -2148,14 +2152,15 @@ def read_table(
schema of the SQL database object.
chunksize : int, default None
Raises NotImplementedError
- dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 8c7381a926e72..0fcf27af42fde 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -959,14 +959,15 @@ def read_xml(
{storage_options}
- dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
+ dtype_backend : {{'numpy_nullable', 'pyarrow'}}
Back-end data type applied to the resultant :class:`DataFrame`
- (still experimental). Behaviour is as follows:
+ (still experimental). If not specified, the default behavior
+ is to not use nullable data types. If specified, the behavior
+ is as follows:
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
- (default).
- * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
- DataFrame.
+ * ``"pyarrow"``: returns pyarrow-backed nullable
+ :class:`ArrowDtype` :class:`DataFrame`
.. versionadded:: 2.0
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index fb7d785a94bc4..9a7e563332a42 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -546,7 +546,7 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes:
new_ax.set_yscale("log")
elif self.logy == "sym" or self.loglog == "sym":
new_ax.set_yscale("symlog")
- return new_ax # type: ignore[return-value]
+ return new_ax
@final
@cache_readonly
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index 7d4aae0f7bb4e..cfba32c62f206 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -4,7 +4,6 @@
import numpy as np
import pytest
-from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td
import pandas as pd
@@ -27,11 +26,10 @@ def test_eq_all_na():
tm.assert_extension_array_equal(result, expected)
-def test_config(string_storage, request, using_infer_string):
- if using_infer_string and string_storage == "python" and HAS_PYARROW:
- # string storage with na_value=NaN always uses pyarrow if available
- # -> does not yet honor the option
- request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
+def test_config(string_storage, using_infer_string):
+ # with the default string_storage setting, the storage is
+ # always "python" at the moment
+ assert StringDtype().storage == "python"
with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
@@ -88,19 +86,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
ArrowStringArray(arr)
-@pytest.mark.xfail(
- reason="dict conversion does not seem to be implemented for large string in arrow"
-)
+@pytest.mark.parametrize("string_type", ["string", "large_string"])
@pytest.mark.parametrize("chunked", [True, False])
-def test_constructor_valid_string_type_value_dictionary(chunked):
+def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
pa = pytest.importorskip("pyarrow")
- arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
+ arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode()
if chunked:
arr = pa.chunked_array(arr)
arr = ArrowStringArray(arr)
- assert pa.types.is_string(arr._pa_array.type.value_type)
+ # dictionary type gets converted to a dense large string array
+ assert pa.types.is_large_string(arr._pa_array.type)
def test_constructor_from_list():
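
Distilled from the updated assertions, a hedged sketch of the conversion (import path as used by this test module; ``_pa_array`` is the internal chunked array the test also inspects):

    import pyarrow as pa
    from pandas.core.arrays.string_arrow import ArrowStringArray

    dict_arr = pa.array(["1", "2", "3"], pa.string()).dictionary_encode()
    arr = ArrowStringArray(dict_arr)
    arr._pa_array.type  # large_string: dictionary input is decoded to a dense array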
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 5834b268be2be..59ff4f3122e8f 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -891,20 +891,24 @@ def test_concat_same_type_different_freq(self, unit):
tm.assert_datetime_array_equal(result, expected)
- def test_strftime(self, arr1d):
+ def test_strftime(self, arr1d, using_infer_string):
arr = arr1d
result = arr.strftime("%Y %b")
expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
+ if using_infer_string:
+ expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+ tm.assert_equal(result, expected)
- def test_strftime_nat(self):
+ def test_strftime_nat(self, using_infer_string):
# GH 29578
arr = DatetimeIndex(["2019-01-01", NaT])._data
result = arr.strftime("%Y-%m-%d")
expected = np.array(["2019-01-01", np.nan], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
+ if using_infer_string:
+ expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+ tm.assert_equal(result, expected)
class TestTimedeltaArray(SharedTests):
@@ -1161,20 +1165,24 @@ def test_array_interface(self, arr1d):
expected = np.asarray(arr).astype("S20")
tm.assert_numpy_array_equal(result, expected)
- def test_strftime(self, arr1d):
+ def test_strftime(self, arr1d, using_infer_string):
arr = arr1d
result = arr.strftime("%Y")
expected = np.array([per.strftime("%Y") for per in arr], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
+ if using_infer_string:
+ expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+ tm.assert_equal(result, expected)
- def test_strftime_nat(self):
+ def test_strftime_nat(self, using_infer_string):
# GH 29578
arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]"))
result = arr.strftime("%Y-%m-%d")
expected = np.array(["2019-01-01", np.nan], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
+ if using_infer_string:
+ expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+ tm.assert_equal(result, expected)
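
The amended expectation, restated as a hedged standalone snippet; this assumes the ``future.infer_string`` option is honored at call time, as the fixture implies:

    import pandas as pd

    arr = pd.DatetimeIndex(["2019-01-01", pd.NaT])._data
    with pd.option_context("future.infer_string", True):
        result = arr.strftime("%Y-%m-%d")
        # str-dtype array with NaN for NaT, i.e. StringDtype(na_value=np.nan)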
@pytest.mark.parametrize(
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 4bf97b1fd8494..2c2dff7a957fe 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -3,6 +3,7 @@
import numpy as np
import pytest
+from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td
from pandas.core.dtypes.astype import astype_array
@@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance():
def test_pandas_dtype_string_dtypes(string_storage):
- # TODO(infer_string) remove skip if "python" is supported
- pytest.importorskip("pyarrow")
+ with pd.option_context("future.infer_string", True):
+ # with the default string_storage setting
+ result = pandas_dtype("str")
+ assert result == pd.StringDtype(
+ "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
+ )
+
with pd.option_context("future.infer_string", True):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
- # TODO(infer_string) hardcoded to pyarrow until python is supported
- assert result == pd.StringDtype("pyarrow", na_value=np.nan)
+ assert result == pd.StringDtype(string_storage, na_value=np.nan)
with pd.option_context("future.infer_string", False):
with pd.option_context("string_storage", string_storage):
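
In isolation, the behavior the rewritten test pins down might look like the following sketch (``pandas_dtype`` is re-exported via ``pandas.api.types``):

    import pandas as pd
    from pandas.api.types import pandas_dtype

    with pd.option_context("future.infer_string", True, "string_storage", "python"):
        pandas_dtype("str")  # StringDtype("python", na_value=np.nan); storage now honored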
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index 59779234b46d9..e7f6e5d625d3e 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -3,21 +3,15 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
import pandas._testing as tm
class TestConvertDtypes:
- # TODO convert_dtypes should not use NaN variant of string dtype, but always NA
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
)
- def test_convert_dtypes(
- self, convert_integer, expected, string_storage, using_infer_string
- ):
+ def test_convert_dtypes(self, convert_integer, expected, string_storage):
# Specific types are tested in tests/series/test_dtypes.py
# Just check that it works for DataFrame here
df = pd.DataFrame(
@@ -182,7 +176,6 @@ def test_convert_dtypes_pyarrow_timestamp(self):
result = expected.convert_dtypes(dtype_backend="pyarrow")
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_convert_dtypes_avoid_block_splitting(self):
# GH#55341
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
@@ -197,7 +190,6 @@ def test_convert_dtypes_avoid_block_splitting(self):
tm.assert_frame_equal(result, expected)
assert result._mgr.nblocks == 2
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_convert_dtypes_from_arrow(self):
# GH#56581
df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py
index 419fb75cb3669..7feb3b6fd816d 100644
--- a/pandas/tests/frame/methods/test_drop_duplicates.py
+++ b/pandas/tests/frame/methods/test_drop_duplicates.py
@@ -476,3 +476,41 @@ def test_drop_duplicates_non_boolean_ignore_index(arg):
msg = '^For argument "ignore_index" expected type bool, received type .*.$'
with pytest.raises(ValueError, match=msg):
df.drop_duplicates(ignore_index=arg)
+
+
+def test_drop_duplicates_set():
+ # GH#59237
+ df = DataFrame(
+ {
+ "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
+ "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
+ "C": [1, 1, 2, 2, 2, 2, 1, 2],
+ "D": range(8),
+ }
+ )
+ # single column
+ result = df.drop_duplicates({"AAA"})
+ expected = df[:2]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates({"AAA"}, keep="last")
+ expected = df.loc[[6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates({"AAA"}, keep=False)
+ expected = df.loc[[]]
+ tm.assert_frame_equal(result, expected)
+ assert len(result) == 0
+
+ # multi column
+ expected = df.loc[[0, 1, 2, 3]]
+ result = df.drop_duplicates({"AAA", "B"})
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates({"AAA", "B"}, keep="last")
+ expected = df.loc[[0, 5, 6, 7]]
+ tm.assert_frame_equal(result, expected)
+
+ result = df.drop_duplicates({"AAA", "B"}, keep=False)
+ expected = df.loc[[0]]
+ tm.assert_frame_equal(result, expected)
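
Hedged usage sketch of what GH#59237 enables: ``subset`` may now be a set, mirroring the existing list form.

    import pandas as pd

    df = pd.DataFrame({"AAA": ["foo", "bar", "foo"], "B": [1, 1, 2]})
    df.drop_duplicates({"AAA"})                    # equivalent to subset=["AAA"]
    df.drop_duplicates({"AAA", "B"}, keep="last")  # multi-column subset as a set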
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index aa2fb19fe8528..fa71153d01157 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -1,4 +1,5 @@
import operator
+from tokenize import TokenError
import numpy as np
import pytest
@@ -1246,6 +1247,8 @@ def df(self):
"it's": [6, 3, 1],
"that's": [9, 1, 8],
"☺": [8, 7, 6],
+ "xy (z)": [1, 2, 3], # noqa: RUF001
+ "xy (z\\uff09": [4, 5, 6], # noqa: RUF001
"foo#bar": [2, 4, 5],
1: [5, 7, 9],
}
@@ -1341,20 +1344,160 @@ def test_missing_attribute(self, df):
with pytest.raises(AttributeError, match=message):
df.eval("@pd.thing")
- def test_failing_quote(self, df):
- msg = r"(Could not convert ).*( to a valid Python identifier.)"
- with pytest.raises(SyntaxError, match=msg):
- df.query("`it's` > `that's`")
+ def test_quote(self, df):
+ res = df.query("`it's` > `that's`")
+ expect = df[df["it's"] > df["that's"]]
+ tm.assert_frame_equal(res, expect)
- def test_failing_character_outside_range(self, df):
- msg = r"(Could not convert ).*( to a valid Python identifier.)"
- with pytest.raises(SyntaxError, match=msg):
- df.query("`☺` > 4")
+ def test_character_outside_range_smiley(self, df):
+ res = df.query("`☺` > 4")
+ expect = df[df["☺"] > 4]
+ tm.assert_frame_equal(res, expect)
- def test_failing_hashtag(self, df):
- msg = "Failed to parse backticks"
- with pytest.raises(SyntaxError, match=msg):
- df.query("`foo#bar` > 4")
+ def test_character_outside_range_2_byte_parens(self, df):
+ # GH 49633
+ res = df.query("`xy (z)` == 2") # noqa: RUF001
+ expect = df[df["xy (z)"] == 2] # noqa: RUF001
+ tm.assert_frame_equal(res, expect)
+
+ def test_character_outside_range_and_actual_backslash(self, df):
+ # GH 49633
+ res = df.query("`xy (z\\uff09` == 2") # noqa: RUF001
+ expect = df[df["xy \uff08z\\uff09"] == 2]
+ tm.assert_frame_equal(res, expect)
+
+ def test_hashtag(self, df):
+ res = df.query("`foo#bar` > 4")
+ expect = df[df["foo#bar"] > 4]
+ tm.assert_frame_equal(res, expect)
+
+ def test_expr_with_column_name_with_hashtag_character(self):
+ # GH 59285
+ df = DataFrame((1, 2, 3), columns=["a#"])
+ result = df.query("`a#` < 2")
+ expected = df[df["a#"] < 2]
+ tm.assert_frame_equal(result, expected)
+
+ def test_expr_with_comment(self):
+ # GH 59285
+ df = DataFrame((1, 2, 3), columns=["a#"])
+ result = df.query("`a#` < 2 # This is a comment")
+ expected = df[df["a#"] < 2]
+ tm.assert_frame_equal(result, expected)
+
+ def test_expr_with_column_name_with_backtick_and_hash(self):
+ # GH 59285
+ df = DataFrame((1, 2, 3), columns=["a`#b"])
+ result = df.query("`a``#b` < 2")
+ expected = df[df["a`#b"] < 2]
+ tm.assert_frame_equal(result, expected)
+
+ def test_expr_with_column_name_with_backtick(self):
+ # GH 59285
+ df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)})
+ result = df.query("`a``b` < 2") # noqa
+ # Note: Formatting checks may wrongly consider the above ``inline code``.
+ expected = df[df["a`b"] < 2]
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ def test_expr_with_string_with_backticks(self):
+ # GH 59285
+ df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"])
+ result = df.query("'```' < `#backticks`")
+ expected = df["```" < df["#backticks"]]
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ def test_expr_with_string_with_backticked_substring_same_as_column_name(self):
+ # GH 59285
+ df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"])
+ result = df.query("'`#backticks`' < `#backticks`")
+ expected = df["`#backticks`" < df["#backticks"]]
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "col1,col2,expr",
+ [
+ ("it's", "that's", "`it's` < `that's`"),
+ ('it"s', 'that"s', '`it"s` < `that"s`'),
+ ("it's", 'that\'s "nice"', "`it's` < `that's \"nice\"`"),
+ ("it's", "that's #cool", "`it's` < `that's #cool` # This is a comment"),
+ ],
+ )
+ def test_expr_with_column_names_with_special_characters(self, col1, col2, expr):
+ # GH 59285
+ df = DataFrame(
+ [
+ {col1: 1, col2: 2},
+ {col1: 3, col2: 4},
+ {col1: -1, col2: -2},
+ {col1: -3, col2: -4},
+ ]
+ )
+ result = df.query(expr)
+ expected = df[df[col1] < df[col2]]
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ def test_expr_with_no_backticks(self):
+ # GH 59285
+ df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"])
+ result = df.query("'value' < column_name")
+ expected = df["value" < df["column_name"]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_expr_with_no_quotes_and_backtick_is_unmatched(self):
+ # GH 59285
+ df = DataFrame((1, 5, 10), columns=["column-name"])
+ with pytest.raises((SyntaxError, TokenError), match="invalid syntax"):
+ df.query("5 < `column-name")
+
+ def test_expr_with_no_quotes_and_backtick_is_matched(self):
+ # GH 59285
+ df = DataFrame((1, 5, 10), columns=["column-name"])
+ result = df.query("5 < `column-name`")
+ expected = df[5 < df["column-name"]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_expr_with_backtick_opened_before_quote_and_backtick_is_unmatched(self):
+ # GH 59285
+ df = DataFrame((1, 5, 10), columns=["It's"])
+ with pytest.raises(
+ (SyntaxError, TokenError), match="unterminated string literal"
+ ):
+ df.query("5 < `It's")
+
+ def test_expr_with_backtick_opened_before_quote_and_backtick_is_matched(self):
+ # GH 59285
+ df = DataFrame((1, 5, 10), columns=["It's"])
+ result = df.query("5 < `It's`")
+ expected = df[5 < df["It's"]]
+ tm.assert_frame_equal(result, expected)
+
+ def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self):
+ # GH 59285
+ df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"])
+ with pytest.raises(
+ (SyntaxError, TokenError), match="unterminated string literal"
+ ):
+ df.query("`column-name` < 'It`s that\\'s \"quote\" #hash")
+
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self):
+ # GH 59285
+ df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"])
+ result = df.query("`column-name` < 'It`s that\\'s \"quote\" #hash'")
+ expected = df[df["column-name"] < 'It`s that\'s "quote" #hash']
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+ def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self):
+ # GH 59285
+ df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"])
+ result = df.query("'It`s that\\'s \"quote\" #hash' < `column-name`")
+ expected = df['It`s that\'s "quote" #hash' < df["column-name"]]
+ tm.assert_frame_equal(result, expected)
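
Taken together, the new tests pin down backtick quoting in ``query``; a hedged condensed example using one of the parametrized cases:

    import pandas as pd

    df = pd.DataFrame({"it's": [1, 2], "that's #cool": [2, 1]})
    # special characters and trailing comments now parse instead of raising
    df.query("`it's` < `that's #cool`  # comparison on oddly named columns")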
def test_call_non_named_expression(self, df):
"""
diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py
index c6697fd169e8a..78a79ac7d1546 100644
--- a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py
+++ b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py
@@ -9,7 +9,6 @@
from dateutil.tz import gettz
import numpy as np
import pytest
-import pytz
from pandas import (
DatetimeIndex,
@@ -69,10 +68,10 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self):
times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"]
index = DatetimeIndex(times)
tz = "US/Eastern"
- with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)):
+ with pytest.raises(ValueError, match="|".join(times)):
index.tz_localize(tz=tz)
- with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)):
+ with pytest.raises(ValueError, match="|".join(times)):
index.tz_localize(tz=tz, nonexistent="raise")
result = index.tz_localize(tz=tz, nonexistent="NaT")
@@ -85,7 +84,7 @@ def test_dti_tz_localize_ambiguous_infer(self, tz):
# November 6, 2011, fall back, repeat 2 AM hour
# With no repeated hours, we cannot infer the transition
dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour())
- with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"):
+ with pytest.raises(ValueError, match="Cannot infer dst time"):
dr.tz_localize(tz)
def test_dti_tz_localize_ambiguous_infer2(self, tz, unit):
@@ -117,7 +116,7 @@ def test_dti_tz_localize_ambiguous_infer3(self, tz):
def test_dti_tz_localize_ambiguous_times(self, tz):
# March 13, 2011, spring forward, skip from 2 AM to 3 AM
dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour())
- with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"):
+ with pytest.raises(ValueError, match="2011-03-13 02:30:00"):
dr.tz_localize(tz)
# after dst transition, it works
@@ -127,7 +126,7 @@ def test_dti_tz_localize_ambiguous_times(self, tz):
# November 6, 2011, fall back, repeat 2 AM hour
dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=offsets.Hour())
- with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"):
+ with pytest.raises(ValueError, match="Cannot infer dst time"):
dr.tz_localize(tz)
# UTC is OK
@@ -163,11 +162,11 @@ def test_dti_tz_localize(self, prefix):
tm.assert_numpy_array_equal(dti3.values, dti_utc.values)
dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="ms")
- with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"):
+ with pytest.raises(ValueError, match="Cannot infer dst time"):
dti.tz_localize(tzstr)
dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="ms")
- with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"):
+ with pytest.raises(ValueError, match="2011-03-13 02:00:00"):
dti.tz_localize(tzstr)
def test_dti_tz_localize_utc_conversion(self, tz):
@@ -184,7 +183,7 @@ def test_dti_tz_localize_utc_conversion(self, tz):
# DST ambiguity, this should fail
rng = date_range("3/11/2012", "3/12/2012", freq="30min")
# Is this really how it should fail??
- with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"):
+ with pytest.raises(ValueError, match="2012-03-11 02:00:00"):
rng.tz_localize(tz)
def test_dti_tz_localize_roundtrip(self, tz_aware_fixture):
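
The error type these tests now expect, restated as a hedged snippet (the errors pandas raises are ``ValueError``-based, so no pytz import is needed):

    import pandas as pd

    dti = pd.DatetimeIndex(["2015-03-08 02:30"])  # nonexistent wall time (spring forward)
    try:
        dti.tz_localize("US/Eastern")
    except ValueError:
        pass                                      # formerly caught as pytz.NonExistentTimeError
    dti.tz_localize("US/Eastern", nonexistent="NaT")  # coerce instead of raising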
diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py
index aba440ceeb56b..8da88b97f9ea8 100644
--- a/pandas/tests/indexes/datetimes/test_constructors.py
+++ b/pandas/tests/indexes/datetimes/test_constructors.py
@@ -14,7 +14,6 @@
from dateutil.tz import gettz
import numpy as np
import pytest
-import pytz
from pandas._libs.tslibs import (
astype_overflowsafe,
@@ -750,7 +749,7 @@ def test_disallow_setting_tz(self):
[
None,
"America/Los_Angeles",
- pytz.timezone("America/Los_Angeles"),
+ zoneinfo.ZoneInfo("America/Los_Angeles"),
Timestamp("2000", tz="America/Los_Angeles").tz,
],
)
@@ -765,8 +764,8 @@ def test_constructor_start_end_with_tz(self, tz):
freq="D",
)
tm.assert_index_equal(result, expected)
- # Especially assert that the timezone is consistent for pytz
- assert pytz.timezone("America/Los_Angeles") is result.tz
+ # Especially assert that the timezone is consistent for zoneinfo
+ assert zoneinfo.ZoneInfo("America/Los_Angeles") is result.tz
@pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"])
def test_constructor_with_non_normalized_pytz(self, tz):
@@ -984,6 +983,7 @@ def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request):
# GH#47471 check that we get the same raising behavior in the DTI
# constructor and Timestamp constructor
if isinstance(tz, str) and tz.startswith("pytz/"):
+ pytz = pytest.importorskip("pytz")
tz = pytz.timezone(tz.removeprefix("pytz/"))
dtstr = "2013-11-03 01:59:59.999999"
item = dtstr
@@ -1000,7 +1000,7 @@ def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request):
mark = pytest.mark.xfail(reason="We implicitly get fold=0.")
request.applymarker(mark)
- with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
+ with pytest.raises(ValueError, match=dtstr):
box_cls(item, tz=tz)
@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index b37b5cf74b347..e09883e95ecec 100644
--- a/pandas/tests/indexes/datetimes/test_date_range.py
+++ b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -11,7 +11,6 @@
import numpy as np
import pytest
-import pytz
from pandas._libs.tslibs import timezones
from pandas._libs.tslibs.offsets import (
@@ -881,7 +880,7 @@ def test_date_range_ambiguous_endpoint(self, tz):
# construction with an ambiguous end-point
# GH#11626
- with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"):
+ with pytest.raises(ValueError, match="Cannot infer dst time"):
date_range(
"2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="h"
)
@@ -905,7 +904,7 @@ def test_date_range_ambiguous_endpoint(self, tz):
def test_date_range_nonexistent_endpoint(self, tz, option, expected):
# construction with an nonexistent end-point
- with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"):
+ with pytest.raises(ValueError, match="2019-03-10 02:00:00"):
date_range(
"2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="h"
)
@@ -1259,6 +1258,24 @@ def test_range_with_timezone_and_custombusinessday(self, start, period, expected
expected = DatetimeIndex(expected).as_unit("ns")
tm.assert_index_equal(result, expected)
+ def test_data_range_custombusinessday_partial_time(self, unit):
+ # GH#57456
+ offset = offsets.CustomBusinessDay(weekmask="Sun Mon Tue")
+ start = datetime(2024, 2, 6, 23)
+ # end datetime is partial and not in the offset
+ end = datetime(2024, 2, 14, 14)
+ result = date_range(start, end, freq=offset, unit=unit)
+ expected = DatetimeIndex(
+ [
+ "2024-02-06 23:00:00",
+ "2024-02-11 23:00:00",
+ "2024-02-12 23:00:00",
+ "2024-02-13 23:00:00",
+ ],
+ dtype=f"M8[{unit}]",
+ )
+ tm.assert_index_equal(result, expected)
+
class TestDateRangeNonNano:
def test_date_range_reso_validation(self):
diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py
index e4b8a909add0d..8d9340818b511 100644
--- a/pandas/tests/indexes/datetimes/test_timezones.py
+++ b/pandas/tests/indexes/datetimes/test_timezones.py
@@ -184,11 +184,8 @@ def test_dti_tz_nat(self, tzstr):
assert isna(idx[1])
assert idx[0].tzinfo is not None
- @pytest.mark.parametrize("tzstr", ["pytz/US/Eastern", "dateutil/US/Eastern"])
+ @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
def test_utc_box_timestamp_and_localize(self, tzstr):
- if tzstr.startswith("pytz/"):
- pytest.importorskip("pytz")
- tzstr = tzstr.removeprefix("pytz/")
tz = timezones.maybe_get_tz(tzstr)
rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc")
@@ -203,11 +200,10 @@ def test_utc_box_timestamp_and_localize(self, tzstr):
# right tzinfo
rng = date_range("3/13/2012", "3/14/2012", freq="h", tz="utc")
rng_eastern = rng.tz_convert(tzstr)
- # test not valid for dateutil timezones.
- # assert 'EDT' in repr(rng_eastern[0].tzinfo)
- assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr(
- rng_eastern[0].tzinfo
- )
+ if "dateutil" in tzstr:
+ assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr(
+ rng_eastern[0].tzinfo
+ )
@pytest.mark.parametrize(
"tz", [zoneinfo.ZoneInfo("US/Central"), gettz("US/Central")]
diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py
index 49b17f8b3d40e..df9c3b390f660 100644
--- a/pandas/tests/indexes/interval/test_interval_tree.py
+++ b/pandas/tests/indexes/interval/test_interval_tree.py
@@ -4,7 +4,10 @@
import pytest
from pandas._libs.interval import IntervalTree
-from pandas.compat import IS64
+from pandas.compat import (
+ IS64,
+ WASM,
+)
import pandas._testing as tm
@@ -186,7 +189,7 @@ def test_construction_overflow(self):
expected = (50 + np.iinfo(np.int64).max) / 2
assert result == expected
- @pytest.mark.xfail(not IS64, reason="GH 23440")
+ @pytest.mark.xfail(WASM, reason="GH 23440")
@pytest.mark.parametrize(
"left, right, expected",
[
diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py
index b72ef57475305..6bcebefa6c696 100644
--- a/pandas/tests/indexing/interval/test_interval.py
+++ b/pandas/tests/indexing/interval/test_interval.py
@@ -2,7 +2,7 @@
import pytest
from pandas._libs import index as libindex
-from pandas.compat import IS64
+from pandas.compat import WASM
import pandas as pd
from pandas import (
@@ -210,7 +210,7 @@ def test_mi_intervalindex_slicing_with_scalar(self):
expected = Series([1, 6, 2, 8, 7], index=expected_index, name="value")
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(not IS64, reason="GH 23440")
+ @pytest.mark.xfail(WASM, reason="GH 23440")
@pytest.mark.parametrize("base", [101, 1010])
def test_reindex_behavior_with_interval_index(self, base):
# GH 51826
diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py
index 4c1efe9e4f81d..051dc7b98f2aa 100644
--- a/pandas/tests/indexing/interval/test_interval_new.py
+++ b/pandas/tests/indexing/interval/test_interval_new.py
@@ -3,7 +3,7 @@
import numpy as np
import pytest
-from pandas.compat import IS64
+from pandas.compat import WASM
from pandas import (
Index,
@@ -214,7 +214,7 @@ def test_loc_getitem_missing_key_error_message(
obj.loc[[4, 5, 6]]
-@pytest.mark.xfail(not IS64, reason="GH 23440")
+@pytest.mark.xfail(WASM, reason="GH 23440")
@pytest.mark.parametrize(
"intervals",
[
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 65a52bc8e0794..b831ec3bb2c6a 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -30,10 +30,6 @@
read_csv,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
engine_params = [
@@ -692,43 +688,33 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel):
)
tm.assert_frame_equal(result, df)
- @pytest.mark.xfail(
- using_string_dtype(), reason="infer_string takes precedence", strict=False
- )
def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel):
# GH#36712
if read_ext in (".xlsb", ".xls"):
pytest.skip(f"No engine for filetype: '{read_ext}'")
- pa = pytest.importorskip("pyarrow")
+ df = DataFrame(
+ {
+ "a": np.array(["a", "b"], dtype=np.object_),
+ "b": np.array(["x", pd.NA], dtype=np.object_),
+ }
+ )
+ df.to_excel(tmp_excel, sheet_name="test", index=False)
with pd.option_context("mode.string_storage", string_storage):
- df = DataFrame(
- {
- "a": np.array(["a", "b"], dtype=np.object_),
- "b": np.array(["x", pd.NA], dtype=np.object_),
- }
- )
- df.to_excel(tmp_excel, sheet_name="test", index=False)
result = pd.read_excel(
tmp_excel, sheet_name="test", dtype_backend="numpy_nullable"
)
- if string_storage == "python":
- expected = DataFrame(
- {
- "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
- "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
- }
- )
- else:
- expected = DataFrame(
- {
- "a": ArrowStringArray(pa.array(["a", "b"])),
- "b": ArrowStringArray(pa.array(["x", None])),
- }
- )
- tm.assert_frame_equal(result, expected)
+ expected = DataFrame(
+ {
+ "a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
+ "b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
+ }
+ )
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
@pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index e1cdfb8bfa7e3..44266ae9a62a5 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -282,7 +282,6 @@ def test_excel_multindex_roundtrip(
)
tm.assert_frame_equal(df, act, check_names=check_names)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_read_excel_parse_dates(self, tmp_excel):
# see gh-11544, gh-12051
df = DataFrame(
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 1bc227369a968..3d07c0219691e 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -28,11 +28,6 @@
read_json,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
-from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
from pandas.io.json import ujson_dumps
@@ -2143,12 +2138,10 @@ def test_json_uint64(self):
result = df.to_json(orient="split")
assert result == expected
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_read_json_dtype_backend(
self, string_storage, dtype_backend, orient, using_infer_string
):
# GH#50750
- pa = pytest.importorskip("pyarrow")
df = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2162,30 +2155,18 @@ def test_read_json_dtype_backend(
}
)
- if using_infer_string:
- string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
- elif string_storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
-
- elif dtype_backend == "pyarrow":
- pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
-
- else:
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
out = df.to_json(orient=orient)
with pd.option_context("mode.string_storage", string_storage):
result = read_json(
StringIO(out), dtype_backend=dtype_backend, orient=orient
)
+ if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
+ string_dtype = pd.ArrowDtype(pa.string())
+ else:
+ string_dtype = pd.StringDtype(string_storage)
+
expected = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2194,12 +2175,13 @@ def test_read_json_dtype_backend(
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
+ "g": Series(["a", "b", "c"], dtype=string_dtype),
+ "h": Series(["a", "b", None], dtype=string_dtype),
}
)
if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray
expected = DataFrame(
@@ -2212,7 +2194,9 @@ def test_read_json_dtype_backend(
if orient == "values":
expected.columns = list(range(8))
- tm.assert_frame_equal(result, expected)
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
@pytest.mark.parametrize("orient", ["split", "records", "index"])
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 3f410a13c8f80..07f29518b7881 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -19,11 +19,7 @@
Timestamp,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- IntegerArray,
- StringArray,
-)
+from pandas.core.arrays import IntegerArray
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -463,11 +459,8 @@ def test_dtype_backend_and_dtype(all_parsers):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_dtype_backend_string(all_parsers, string_storage):
# GH#36712
- pa = pytest.importorskip("pyarrow")
-
with pd.option_context("mode.string_storage", string_storage):
parser = all_parsers
@@ -477,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage):
"""
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
- if string_storage == "python":
- expected = DataFrame(
- {
- "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
- "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
- }
- )
- else:
- expected = DataFrame(
- {
- "a": ArrowStringArray(pa.array(["a", "b"])),
- "b": ArrowStringArray(pa.array(["x", None])),
- }
- )
- tm.assert_frame_equal(result, expected)
+ expected = DataFrame(
+ {
+ "a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
+ "b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
+ },
+ )
+ tm.assert_frame_equal(result, expected)
def test_dtype_backend_ea_dtype_specified(all_parsers):
@@ -507,7 +492,6 @@ def test_dtype_backend_ea_dtype_specified(all_parsers):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_dtype_backend_pyarrow(all_parsers, request):
# GH#36712
pa = pytest.importorskip("pyarrow")
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index b7b4a77c9e048..6243185294894 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -13,8 +13,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.errors import EmptyDataError
import pandas as pd
@@ -23,10 +21,6 @@
DatetimeIndex,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.io.common import urlopen
from pandas.io.parsers import (
@@ -941,39 +935,30 @@ def test_widths_and_usecols():
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_dtype_backend(string_storage, dtype_backend):
# GH#50289
- if string_storage == "python":
- arr = StringArray(np.array(["a", "b"], dtype=np.object_))
- arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
- elif dtype_backend == "pyarrow":
- pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- arr = ArrowExtensionArray(pa.array(["a", "b"]))
- arr_na = ArrowExtensionArray(pa.array([None, "a"]))
- else:
- pa = pytest.importorskip("pyarrow")
- arr = ArrowStringArray(pa.array(["a", "b"]))
- arr_na = ArrowStringArray(pa.array([None, "a"]))
-
data = """a b c d e f g h i
1 2.5 True a
3 4.5 False b True 6 7.5 a"""
with pd.option_context("mode.string_storage", string_storage):
result = read_fwf(StringIO(data), dtype_backend=dtype_backend)
+ if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
+ string_dtype = pd.ArrowDtype(pa.string())
+ else:
+ string_dtype = pd.StringDtype(string_storage)
+
expected = DataFrame(
{
"a": pd.Series([1, 3], dtype="Int64"),
"b": pd.Series([2.5, 4.5], dtype="Float64"),
"c": pd.Series([True, False], dtype="boolean"),
- "d": arr,
+ "d": pd.Series(["a", "b"], dtype=string_dtype),
"e": pd.Series([pd.NA, True], dtype="boolean"),
"f": pd.Series([pd.NA, 6], dtype="Int64"),
"g": pd.Series([pd.NA, 7.5], dtype="Float64"),
- "h": arr_na,
+ "h": pd.Series([None, "a"], dtype=string_dtype),
"i": pd.Series([pd.NA, pd.NA], dtype="Int64"),
}
)
@@ -989,7 +974,9 @@ def test_dtype_backend(string_storage, dtype_backend):
)
expected["i"] = ArrowExtensionArray(pa.array([None, None]))
- tm.assert_frame_equal(result, expected)
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
def test_invalid_dtype_backend():
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
index 923b880004c26..541cc39606047 100644
--- a/pandas/tests/io/test_clipboard.py
+++ b/pandas/tests/io/test_clipboard.py
@@ -19,10 +19,6 @@
read_clipboard,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.io.clipboard import (
CheckedCall,
@@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend(
self, clipboard, string_storage, dtype_backend, engine
):
# GH#50502
- if string_storage == "pyarrow" or dtype_backend == "pyarrow":
- pa = pytest.importorskip("pyarrow")
-
- if string_storage == "python":
- string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
- string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
-
- elif dtype_backend == "pyarrow" and engine != "c":
+ if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["x", "y"]))
- string_array_na = ArrowExtensionArray(pa.array(["x", None]))
-
+ if engine == "c" and string_storage == "pyarrow":
+ # TODO avoid this exception?
+ string_dtype = pd.ArrowDtype(pa.large_string())
+ else:
+ string_dtype = pd.ArrowDtype(pa.string())
else:
- string_array = ArrowStringArray(pa.array(["x", "y"]))
- string_array_na = ArrowStringArray(pa.array(["x", None]))
+ string_dtype = pd.StringDtype(string_storage)
text = """a,b,c,d,e,f,g,h,i
x,1,4.0,x,2,4.0,,True,False
@@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend(
expected = DataFrame(
{
- "a": string_array,
+ "a": Series(["x", "y"], dtype=string_dtype),
"b": Series([1, 2], dtype="Int64"),
"c": Series([4.0, 5.0], dtype="Float64"),
- "d": string_array_na,
+ "d": Series(["x", None], dtype=string_dtype),
"e": Series([2, NA], dtype="Int64"),
"f": Series([4.0, NA], dtype="Float64"),
"g": Series([NA, NA], dtype="Int64"),
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 5aa8f1c69fe44..6dd4368f09cc8 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -9,10 +9,6 @@
import pandas as pd
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.io.feather_format import read_feather, to_feather # isort:skip
@@ -184,25 +180,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
}
)
- if string_storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
-
- elif dtype_backend == "pyarrow":
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
-
- else:
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
with tm.ensure_clean() as path:
to_feather(df, path)
with pd.option_context("mode.string_storage", string_storage):
result = read_feather(path, dtype_backend=dtype_backend)
+ if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
+ string_dtype = pd.ArrowDtype(pa.string())
+ else:
+ string_dtype = pd.StringDtype(string_storage)
+
expected = pd.DataFrame(
{
"a": pd.Series([1, np.nan, 3], dtype="Int64"),
@@ -211,8 +199,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
"d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": pd.Series([True, False, pd.NA], dtype="boolean"),
"f": pd.Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
+ "g": pd.Series(["a", "b", "c"], dtype=string_dtype),
+ "h": pd.Series(["a", "b", None], dtype=string_dtype),
}
)
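
The same dtype-selection pattern now recurs across these IO tests; a hedged distillation (helper name hypothetical):

    import pandas as pd

    def expected_string_dtype(dtype_backend: str, string_storage: str):
        # sketch of the branch repeated in the refactored tests above
        if dtype_backend == "pyarrow":
            import pyarrow as pa
            return pd.ArrowDtype(pa.string())
        return pd.StringDtype(string_storage)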
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 164646aedf464..73e9933e3681b 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -13,8 +13,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.compat import is_platform_windows
import pandas.util._test_decorators as td
@@ -31,17 +29,9 @@
to_datetime,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.io.common import file_path_to_url
-pytestmark = pytest.mark.xfail(
- using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
@pytest.fixture(
params=[
@@ -156,7 +146,7 @@ def test_to_html_compat(self, flavor_read_html):
df = (
DataFrame(
np.random.default_rng(2).random((4, 3)),
- columns=pd.Index(list("abc"), dtype=object),
+ columns=pd.Index(list("abc")),
)
.map("{:.3f}".format)
.astype(float)
@@ -182,24 +172,16 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
}
)
- if string_storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
- elif dtype_backend == "pyarrow":
- pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
- else:
- pa = pytest.importorskip("pyarrow")
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
-
out = df.to_html(index=False)
with pd.option_context("mode.string_storage", string_storage):
result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0]
+ if dtype_backend == "pyarrow":
+ pa = pytest.importorskip("pyarrow")
+ string_dtype = pd.ArrowDtype(pa.string())
+ else:
+ string_dtype = pd.StringDtype(string_storage)
+
expected = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -208,8 +190,8 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
+ "g": Series(["a", "b", "c"], dtype=string_dtype),
+ "h": Series(["a", "b", None], dtype=string_dtype),
}
)
@@ -225,7 +207,9 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
}
)
- tm.assert_frame_equal(result, expected)
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that for checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
@pytest.mark.network
@pytest.mark.single_cpu
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 561c718ea5851..f4d64bf84b3f5 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -17,6 +17,7 @@
pa_version_under13p0,
pa_version_under15p0,
pa_version_under17p0,
+ pa_version_under18p0,
)
import pandas as pd
@@ -955,11 +956,16 @@ def test_timestamp_nanoseconds(self, pa):
def test_timezone_aware_index(self, request, pa, timezone_aware_date_list):
pytest.importorskip("pyarrow", "11.0.0")
- if timezone_aware_date_list.tzinfo != datetime.timezone.utc:
+ if (
+ timezone_aware_date_list.tzinfo != datetime.timezone.utc
+ and pa_version_under18p0
+ ):
request.applymarker(
pytest.mark.xfail(
- reason="temporary skip this test until it is properly resolved: "
- "https://github.com/pandas-dev/pandas/issues/37286"
+ reason=(
+ "pyarrow returns pytz.FixedOffset while pandas "
+ "constructs datetime.timezone https://github.com/pandas-dev/pandas/issues/37286"
+ )
)
)
idx = 5 * [timezone_aware_date_list]
@@ -1131,6 +1137,21 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
# assert result["strings"].dtype == "string"
# FIXME: don't leave commented-out
+ def test_non_nanosecond_timestamps(self, temp_file):
+ # GH#49236
+ pa = pytest.importorskip("pyarrow", "11.0.0")
+ pq = pytest.importorskip("pyarrow.parquet")
+
+ arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us"))
+ table = pa.table([arr], names=["timestamp"])
+ pq.write_table(table, temp_file)
+ result = read_parquet(temp_file)
+ expected = pd.DataFrame(
+ data={"timestamp": [datetime.datetime(1600, 1, 1)]},
+ dtype="datetime64[us]",
+ )
+ tm.assert_frame_equal(result, expected)
+
class TestParquetFastParquet(Base):
@pytest.mark.xfail(reason="datetime_with_nat gets incorrect values")
@@ -1172,6 +1193,10 @@ def test_duplicate_columns(self, fp):
msg = "Cannot create parquet dataset with duplicate column names"
self.check_error_on_write(df, fp, ValueError, msg)
+ @pytest.mark.xfail(
+ Version(np.__version__) >= Version("2.0.0"),
+ reason="fastparquet uses np.float_ in numpy2",
+ )
def test_bool_with_none(self, fp):
df = pd.DataFrame({"a": [True, None, False]})
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index a21893f66722a..980c88f070b89 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -39,10 +39,6 @@
to_timedelta,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
from pandas.util.version import Version
from pandas.io import sql
@@ -3661,24 +3657,13 @@ def dtype_backend_data() -> DataFrame:
@pytest.fixture
def dtype_backend_expected():
- def func(storage, dtype_backend, conn_name) -> DataFrame:
- string_array: StringArray | ArrowStringArray
- string_array_na: StringArray | ArrowStringArray
- if storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
-
- elif dtype_backend == "pyarrow":
+ def func(string_storage, dtype_backend, conn_name) -> DataFrame:
+ string_dtype: pd.StringDtype | pd.ArrowDtype
+ if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment]
- string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment]
-
+ string_dtype = pd.ArrowDtype(pa.string())
else:
- pa = pytest.importorskip("pyarrow")
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
+ string_dtype = pd.StringDtype(string_storage)
df = DataFrame(
{
@@ -3688,8 +3673,8 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, pd.NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
+ "g": Series(["a", "b", "c"], dtype=string_dtype),
+ "h": Series(["a", "b", None], dtype=string_dtype),
}
)
if dtype_backend == "pyarrow":
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 036a5d6265dd7..5c07a56c9fb3f 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -14,8 +14,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.compat import WASM
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
@@ -31,11 +29,6 @@
Series,
)
import pandas._testing as tm
-from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
-)
-from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
from pandas.io.common import get_handle
from pandas.io.xml import read_xml
@@ -1992,7 +1985,6 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so):
tm.assert_frame_equal(df_lxml, df_etree)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_read_xml_nullable_dtypes(
parser, string_storage, dtype_backend, using_infer_string
):
@@ -2023,36 +2015,21 @@ def test_read_xml_nullable_dtypes(
"""
- if using_infer_string:
- pa = pytest.importorskip("pyarrow")
- string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"]))
- string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None]))
-
- elif string_storage == "python":
- string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
- string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
+ with pd.option_context("mode.string_storage", string_storage):
+ result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend)
- elif dtype_backend == "pyarrow":
+ if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
- from pandas.arrays import ArrowExtensionArray
-
- string_array = ArrowExtensionArray(pa.array(["x", "y"]))
- string_array_na = ArrowExtensionArray(pa.array(["x", None]))
-
+ string_dtype = pd.ArrowDtype(pa.string())
else:
- pa = pytest.importorskip("pyarrow")
- string_array = ArrowStringArray(pa.array(["x", "y"]))
- string_array_na = ArrowStringArray(pa.array(["x", None]))
-
- with pd.option_context("mode.string_storage", string_storage):
- result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend)
+ string_dtype = pd.StringDtype(string_storage)
expected = DataFrame(
{
- "a": string_array,
+ "a": Series(["x", "y"], dtype=string_dtype),
"b": Series([1, 2], dtype="Int64"),
"c": Series([4.0, 5.0], dtype="Float64"),
- "d": string_array_na,
+ "d": Series(["x", None], dtype=string_dtype),
"e": Series([2, NA], dtype="Int64"),
"f": Series([4.0, NA], dtype="Float64"),
"g": Series([NA, NA], dtype="Int64"),
@@ -2073,7 +2050,9 @@ def test_read_xml_nullable_dtypes(
)
expected["g"] = ArrowExtensionArray(pa.array([None, None]))
- tm.assert_frame_equal(result, expected)
+ # the storage of the str columns' Index is also affected by the
+ # string_storage setting -> ignore that when checking the result
+ tm.assert_frame_equal(result, expected, check_column_type=False)
def test_invalid_dtype_backend():
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index b381c4fce8430..b39f953da1ee6 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -45,6 +45,7 @@
_check_visible,
get_y_axis,
)
+from pandas.util.version import Version
from pandas.io.formats.printing import pprint_thing
@@ -2465,8 +2466,14 @@ def test_group_subplot_invalid_column_name(self):
d = {"a": np.arange(10), "b": np.arange(10)}
df = DataFrame(d)
- with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"):
- df.plot(subplots=[("a", "bad_name")])
+ if Version(np.__version__) < Version("2.0.0"):
+ with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"):
+ df.plot(subplots=[("a", "bad_name")])
+ else:
+ with pytest.raises(
+ ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]"
+ ):
+ df.plot(subplots=[("a", "bad_name")])
def test_group_subplot_duplicated_column(self):
d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 59e2fe81cc7a1..5251b721b685f 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -3013,3 +3013,15 @@ def test_merge_datetime_and_timedelta(how):
)
with pytest.raises(ValueError, match=re.escape(msg)):
right.merge(left, on="key", how=how)
+
+
+def test_merge_on_all_nan_column():
+ # GH#59421
+ left = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6]})
+ right = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "zz": [4, 5, 6]})
+ result = left.merge(right, on=["x", "y"], how="outer")
+ # Should not trigger an array bounds error with bounds checking or ASAN enabled.
+ expected = DataFrame(
+ {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]}
+ )
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/scalar/timestamp/methods/test_round.py b/pandas/tests/scalar/timestamp/methods/test_round.py
index 2fb0e1a8d3397..944aa55727217 100644
--- a/pandas/tests/scalar/timestamp/methods/test_round.py
+++ b/pandas/tests/scalar/timestamp/methods/test_round.py
@@ -4,7 +4,6 @@
)
import numpy as np
import pytest
-import pytz
from pandas._libs import lib
from pandas._libs.tslibs import (
@@ -182,7 +181,7 @@ def test_round_dst_border_ambiguous(self, method, unit):
assert result is NaT
msg = "Cannot infer dst time"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
getattr(ts, method)("h", ambiguous="raise")
@pytest.mark.parametrize(
@@ -205,7 +204,7 @@ def test_round_dst_border_nonexistent(self, method, ts_str, freq, unit):
assert result is NaT
msg = "2018-03-11 02:00:00"
- with pytest.raises(pytz.NonExistentTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
getattr(ts, method)(freq, nonexistent="raise")
@pytest.mark.parametrize(
diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
index 90dc8d77608cb..cb7ac5fa6f1da 100644
--- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
+++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py
@@ -4,11 +4,6 @@
from dateutil.tz import gettz
import pytest
-import pytz
-from pytz.exceptions import (
- AmbiguousTimeError,
- NonExistentTimeError,
-)
from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
from pandas.errors import OutOfBoundsDatetime
@@ -54,13 +49,14 @@ def test_tz_localize_ambiguous_bool(self, unit, tz):
# make sure that we are correctly accepting bool values as ambiguous
# GH#14402
if isinstance(tz, str) and tz.startswith("pytz/"):
+ pytz = pytest.importorskip("pytz")
tz = pytz.timezone(tz.removeprefix("pytz/"))
ts = Timestamp("2015-11-01 01:00:03").as_unit(unit)
expected0 = Timestamp("2015-11-01 01:00:03-0500", tz=tz)
expected1 = Timestamp("2015-11-01 01:00:03-0600", tz=tz)
msg = "Cannot infer dst time from 2015-11-01 01:00:03"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
ts.tz_localize(tz)
result = ts.tz_localize(tz, ambiguous=True)
@@ -105,10 +101,10 @@ def test_tz_localize_ambiguous(self):
def test_tz_localize_nonexistent(self, stamp, tz):
# GH#13057
ts = Timestamp(stamp)
- with pytest.raises(NonExistentTimeError, match=stamp):
+ with pytest.raises(ValueError, match=stamp):
ts.tz_localize(tz)
# GH 22644
- with pytest.raises(NonExistentTimeError, match=stamp):
+ with pytest.raises(ValueError, match=stamp):
ts.tz_localize(tz, nonexistent="raise")
assert ts.tz_localize(tz, nonexistent="NaT") is NaT
@@ -154,7 +150,7 @@ def test_tz_localize_ambiguous_raise(self):
# GH#13057
ts = Timestamp("2015-11-1 01:00")
msg = "Cannot infer dst time from 2015-11-01 01:00:00,"
- with pytest.raises(AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
ts.tz_localize("US/Pacific", ambiguous="raise")
def test_tz_localize_nonexistent_invalid_arg(self, warsaw):
@@ -330,7 +326,7 @@ def test_timestamp_tz_localize_nonexistent_raise(self, warsaw, unit):
tz = warsaw
ts = Timestamp("2015-03-29 02:20:00").as_unit(unit)
msg = "2015-03-29 02:20:00"
- with pytest.raises(pytz.NonExistentTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
ts.tz_localize(tz, nonexistent="raise")
msg = (
"The nonexistent argument must be one of 'raise', 'NaT', "
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
index 39f302c3357de..2c97c4a32e0aa 100644
--- a/pandas/tests/scalar/timestamp/test_constructors.py
+++ b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -15,7 +15,6 @@
)
import numpy as np
import pytest
-import pytz
from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
from pandas.errors import OutOfBoundsDatetime
@@ -747,7 +746,7 @@ def test_constructor_tz_or_tzinfo(self):
tz="UTC",
),
Timestamp(2000, 1, 2, 3, 4, 5, 6, None, nanosecond=1),
- Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=pytz.UTC, nanosecond=1),
+ Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=timezone.utc, nanosecond=1),
],
)
def test_constructor_nanosecond(self, result):
@@ -904,7 +903,7 @@ def test_raise_tz_and_tzinfo_in_datetime_input(self, box):
Timestamp(box(**kwargs), tz="US/Pacific")
msg = "Cannot pass a datetime or Timestamp"
with pytest.raises(ValueError, match=msg):
- Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific"))
+ Timestamp(box(**kwargs), tzinfo=zoneinfo.ZoneInfo("US/Pacific"))
def test_dont_convert_dateutil_utc_to_default_utc(self):
result = Timestamp(datetime(2018, 1, 1), tz=tzutc())
@@ -948,7 +947,7 @@ def test_timestamp_constructor_near_dst_boundary(self):
assert result == expected
msg = "Cannot infer dst time from 2015-10-25 02:00:00"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
Timestamp("2015-10-25 02:00", tz=tz)
result = Timestamp("2017-03-26 01:00", tz="Europe/Paris")
@@ -956,7 +955,7 @@ def test_timestamp_constructor_near_dst_boundary(self):
assert result == expected
msg = "2017-03-26 02:00"
- with pytest.raises(pytz.NonExistentTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
Timestamp("2017-03-26 02:00", tz="Europe/Paris")
# GH#11708
@@ -975,7 +974,7 @@ def test_timestamp_constructor_near_dst_boundary(self):
assert result == expected
msg = "2017-03-26 02:00"
- with pytest.raises(pytz.NonExistentTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
Timestamp("2017-03-26 02:00", tz="Europe/Paris")
result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris")
diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py
index 3e1ece6b7f59e..9b9a8ea3600ae 100644
--- a/pandas/tests/series/accessors/test_dt_accessor.py
+++ b/pandas/tests/series/accessors/test_dt_accessor.py
@@ -9,7 +9,6 @@
import numpy as np
import pytest
-import pytz
from pandas._config import using_string_dtype
@@ -28,6 +27,7 @@
Period,
PeriodIndex,
Series,
+ StringDtype,
TimedeltaIndex,
date_range,
period_range,
@@ -352,7 +352,7 @@ def test_dt_round_tz_ambiguous(self, method):
tm.assert_series_equal(result, expected)
# raise
- with tm.external_error_raised(pytz.AmbiguousTimeError):
+ with tm.external_error_raised(ValueError):
getattr(df1.date.dt, method)("h", ambiguous="raise")
@pytest.mark.parametrize(
@@ -374,7 +374,7 @@ def test_dt_round_tz_nonexistent(self, method, ts_str, freq):
expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz)
tm.assert_series_equal(result, expected)
- with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"):
+ with pytest.raises(ValueError, match="2018-03-11 02:00:00"):
getattr(ser.dt, method)(freq, nonexistent="raise")
@pytest.mark.parametrize("freq", ["ns", "us", "1000us"])
@@ -514,7 +514,6 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale):
ser = pd.concat([ser, Series([pd.NaT])])
assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1])
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_strftime(self):
# GH 10086
ser = Series(date_range("20130101", periods=5))
@@ -585,10 +584,9 @@ def test_strftime_period_days(self, using_infer_string):
dtype="=U10",
)
if using_infer_string:
- expected = expected.astype("str")
+ expected = expected.astype(StringDtype(na_value=np.nan))
tm.assert_index_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_strftime_dt64_microsecond_resolution(self):
ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)])
result = ser.dt.strftime("%Y-%m-%d %H:%M:%S")
@@ -621,7 +619,6 @@ def test_strftime_period_minutes(self):
)
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"data",
[
@@ -644,7 +641,7 @@ def test_strftime_all_nat(self, data):
ser = Series(data)
with tm.assert_produces_warning(None):
result = ser.dt.strftime("%Y-%m-%d")
- expected = Series([np.nan], dtype=object)
+ expected = Series([np.nan], dtype="str")
tm.assert_series_equal(result, expected)
def test_valid_dt_with_missing_values(self):
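The strftime expectations above now target the NaN-backed `str` dtype rather than object. A sketch of the updated contract, assuming the string-inference mode these tests run under:

```python
import pandas as pd

ser = pd.Series([pd.NaT], dtype="datetime64[ns]")
out = ser.dt.strftime("%Y-%m-%d")
# all-NaT input formats to all-NaN output; under string inference the
# result dtype is the NaN-backed "str" dtype instead of object
print(out.dtype)
```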
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index 4a8af259b4134..90c4056a39e84 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas._libs import lib
import pandas as pd
@@ -12,7 +10,6 @@
class TestSeriesConvertDtypes:
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"data, maindtype, expected_default, expected_other",
[
@@ -223,9 +220,9 @@ def test_convert_dtypes(
and params[0]
and not params[1]
):
- # If we would convert with convert strings then infer_objects converts
- # with the option
- expected_dtype = "string[pyarrow_numpy]"
+ # If convert_string=False and infer_objects=True, we end up with the
+ # default string dtype instead of preserving object for string data
+ expected_dtype = pd.StringDtype(na_value=np.nan)
expected = pd.Series(data, dtype=expected_dtype)
tm.assert_series_equal(result, expected)
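A sketch of the interaction the new comment describes (behavior assumed under the string-inference option; both keywords are part of the public `convert_dtypes` signature):

```python
import pandas as pd

ser = pd.Series(["a", "b"], dtype=object)
# convert_string=False skips the nullable StringDtype conversion, but
# infer_objects=True still lets object strings be inferred, ending up at
# the default NaN-backed str dtype when string inference is enabled
result = ser.convert_dtypes(convert_string=False, infer_objects=True)
```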
diff --git a/pandas/tests/series/methods/test_tz_localize.py b/pandas/tests/series/methods/test_tz_localize.py
index 45620a721f442..53288e8a1f8e7 100644
--- a/pandas/tests/series/methods/test_tz_localize.py
+++ b/pandas/tests/series/methods/test_tz_localize.py
@@ -1,7 +1,6 @@
from datetime import timezone
import pytest
-import pytz
from pandas._libs.tslibs import timezones
@@ -28,7 +27,7 @@ def test_series_tz_localize_ambiguous_bool(self):
expected0 = Series([expected0])
expected1 = Series([expected1])
- with tm.external_error_raised(pytz.AmbiguousTimeError):
+ with tm.external_error_raised(ValueError):
ser.dt.tz_localize("US/Central")
result = ser.dt.tz_localize("US/Central", ambiguous=True)
@@ -79,11 +78,11 @@ def test_tz_localize_nonexistent(self, warsaw, method, exp, unit):
df = ser.to_frame()
if method == "raise":
- with tm.external_error_raised(pytz.NonExistentTimeError):
+ with tm.external_error_raised(ValueError):
dti.tz_localize(tz, nonexistent=method)
- with tm.external_error_raised(pytz.NonExistentTimeError):
+ with tm.external_error_raised(ValueError):
ser.tz_localize(tz, nonexistent=method)
- with tm.external_error_raised(pytz.NonExistentTimeError):
+ with tm.external_error_raised(ValueError):
df.tz_localize(tz, nonexistent=method)
elif exp == "invalid":
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
index 262ec35b472ad..baed3ba936699 100644
--- a/pandas/tests/series/test_logical_ops.py
+++ b/pandas/tests/series/test_logical_ops.py
@@ -9,6 +9,7 @@
from pandas.compat import HAS_PYARROW
from pandas import (
+ ArrowDtype,
DataFrame,
Index,
Series,
@@ -523,18 +524,38 @@ def test_int_dtype_different_index_not_bool(self):
result = ser1 ^ ser2
tm.assert_series_equal(result, expected)
+ # TODO: this belongs in comparison tests
def test_pyarrow_numpy_string_invalid(self):
# GH#56008
- pytest.importorskip("pyarrow")
+ pa = pytest.importorskip("pyarrow")
ser = Series([False, True])
ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]")
result = ser == ser2
- expected = Series(False, index=ser.index)
- tm.assert_series_equal(result, expected)
+ expected_eq = Series(False, index=ser.index)
+ tm.assert_series_equal(result, expected_eq)
result = ser != ser2
- expected = Series(True, index=ser.index)
- tm.assert_series_equal(result, expected)
+ expected_ne = Series(True, index=ser.index)
+ tm.assert_series_equal(result, expected_ne)
with pytest.raises(TypeError, match="Invalid comparison"):
ser > ser2
+
+ # GH#59505
+ ser3 = ser2.astype("string[pyarrow]")
+ result3_eq = ser3 == ser
+ tm.assert_series_equal(result3_eq, expected_eq.astype("bool[pyarrow]"))
+ result3_ne = ser3 != ser
+ tm.assert_series_equal(result3_ne, expected_ne.astype("bool[pyarrow]"))
+
+ with pytest.raises(TypeError, match="Invalid comparison"):
+ ser > ser3
+
+ ser4 = ser2.astype(ArrowDtype(pa.string()))
+ result4_eq = ser4 == ser
+ tm.assert_series_equal(result4_eq, expected_eq.astype("bool[pyarrow]"))
+ result4_ne = ser4 != ser
+ tm.assert_series_equal(result4_ne, expected_ne.astype("bool[pyarrow]"))
+
+ with pytest.raises(TypeError, match="Invalid comparison"):
+ ser > ser4
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index ee26fdae74960..18df76ddd8ed8 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -218,7 +218,7 @@ def test_missing_required_dependency():
subprocess.check_output(call, stderr=subprocess.STDOUT)
output = exc.value.stdout.decode()
- for name in ["numpy", "pytz", "dateutil"]:
+ for name in ["numpy", "dateutil"]:
assert name in output
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index 2a225bda953cf..869d41efa6c28 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -408,6 +408,13 @@ def test_codes_out_of_bound(self):
tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result_codes, expected_codes)
+ @pytest.mark.parametrize("codes", [[-1, -1], [2, -1], [2, 2]])
+ def test_codes_empty_array_out_of_bound(self, codes):
+ empty_values = np.array([])
+ expected_codes = -np.ones_like(codes, dtype=np.intp)
+ _, result_codes = safe_sort(empty_values, codes)
+ tm.assert_numpy_array_equal(result_codes, expected_codes)
+
def test_mixed_integer(self):
values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object)
result = safe_sort(values)
diff --git a/pandas/tests/tseries/offsets/test_dst.py b/pandas/tests/tseries/offsets/test_dst.py
index dfdc69c0fe18e..e75958843040d 100644
--- a/pandas/tests/tseries/offsets/test_dst.py
+++ b/pandas/tests/tseries/offsets/test_dst.py
@@ -108,13 +108,13 @@ def _test_offset(
"second": "2013-11-03 01:59:01.999999",
"microsecond": "2013-11-03 01:59:59.000001",
}[offset_name]
- with pytest.raises(pytz.AmbiguousTimeError, match=err_msg):
+ with pytest.raises(ValueError, match=err_msg):
tstart + offset
# While we're here, let's check that we get the same behavior in a
# vectorized path
dti = DatetimeIndex([tstart])
warn_msg = "Non-vectorized DateOffset"
- with pytest.raises(pytz.AmbiguousTimeError, match=err_msg):
+ with pytest.raises(ValueError, match=err_msg):
with tm.assert_produces_warning(performance_warning, match=warn_msg):
dti + offset
return
@@ -256,10 +256,10 @@ def test_all_offset_classes(self, tup):
],
)
def test_nontick_offset_with_ambiguous_time_error(original_dt, target_dt, offset, tz):
- # .apply for non-Tick offsets throws AmbiguousTimeError when the target dt
+ # .apply for non-Tick offsets throws ValueError when the target dt
# is dst-ambiguous
- localized_dt = original_dt.tz_localize(pytz.timezone(tz))
+ localized_dt = original_dt.tz_localize(tz)
msg = f"Cannot infer dst time from {target_dt}, try using the 'ambiguous' argument"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
+ with pytest.raises(ValueError, match=msg):
localized_dt + offset
diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py
index 99a6a583dd3e9..943434e515828 100644
--- a/pandas/tests/tseries/offsets/test_offsets_properties.py
+++ b/pandas/tests/tseries/offsets/test_offsets_properties.py
@@ -13,7 +13,6 @@
given,
)
import pytest
-import pytz
import pandas as pd
from pandas._testing._hypothesis import (
@@ -34,11 +33,11 @@ def test_on_offset_implementations(dt, offset):
# (dt + offset) - offset == dt
try:
compare = (dt + offset) - offset
- except (pytz.NonExistentTimeError, pytz.AmbiguousTimeError):
+ except ValueError:
# When dt + offset does not exist or is DST-ambiguous, assume(False) to
# indicate to hypothesis that this is not a valid test case
# DST-ambiguous example (GH41906):
- # dt = datetime.datetime(1900, 1, 1, tzinfo=pytz.timezone('Africa/Kinshasa'))
+ # dt = datetime.datetime(1900, 1, 1, tzinfo=ZoneInfo('Africa/Kinshasa'))
# offset = MonthBegin(66)
assume(False)
diff --git a/pandas/tests/tslibs/test_tzconversion.py b/pandas/tests/tslibs/test_tzconversion.py
index c1a56ffb71b02..f32829b4e0b21 100644
--- a/pandas/tests/tslibs/test_tzconversion.py
+++ b/pandas/tests/tslibs/test_tzconversion.py
@@ -1,6 +1,7 @@
+import zoneinfo
+
import numpy as np
import pytest
-import pytz
from pandas._libs.tslibs.tzconversion import tz_localize_to_utc
@@ -11,13 +12,15 @@ def test_tz_localize_to_utc_ambiguous_infer(self):
val = 1_320_541_200_000_000_000
vals = np.array([val, val - 1, val], dtype=np.int64)
- with pytest.raises(pytz.AmbiguousTimeError, match="2011-11-06 01:00:00"):
- tz_localize_to_utc(vals, pytz.timezone("US/Eastern"), ambiguous="infer")
+ with pytest.raises(ValueError, match="2011-11-06 01:00:00"):
+ tz_localize_to_utc(vals, zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer")
- with pytest.raises(pytz.AmbiguousTimeError, match="are no repeated times"):
- tz_localize_to_utc(vals[:1], pytz.timezone("US/Eastern"), ambiguous="infer")
+ with pytest.raises(ValueError, match="are no repeated times"):
+ tz_localize_to_utc(
+ vals[:1], zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer"
+ )
vals[1] += 1
msg = "There are 2 dst switches when there should only be 1"
- with pytest.raises(pytz.AmbiguousTimeError, match=msg):
- tz_localize_to_utc(vals, pytz.timezone("US/Eastern"), ambiguous="infer")
+ with pytest.raises(ValueError, match=msg):
+ tz_localize_to_utc(vals, zoneinfo.ZoneInfo("US/Eastern"), ambiguous="infer")
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
index 7e18ebe40cfa8..bd20660bdbba6 100644
--- a/pandas/util/_print_versions.py
+++ b/pandas/util/_print_versions.py
@@ -67,7 +67,6 @@ def _get_dependency_info() -> dict[str, JSONSerializable]:
"pandas",
# required
"numpy",
- "pytz",
"dateutil",
# install / build,
"pip",
diff --git a/pyproject.toml b/pyproject.toml
index cc5cc1cf84d0c..645ded35f3d18 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,6 @@ dependencies = [
"numpy>=1.23.5; python_version<'3.12'",
"numpy>=1.26.0; python_version>='3.12'",
"python-dateutil>=2.8.2",
- "pytz>=2020.1",
"tzdata>=2022.7"
]
classifiers = [
@@ -81,6 +80,7 @@ plot = ['matplotlib>=3.6.3']
output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0']
clipboard = ['PyQt5>=5.15.9', 'qtpy>=2.3.0']
compression = ['zstandard>=0.19.0']
+timezone = ['pytz>=2023.4']
all = ['adbc-driver-postgresql>=0.10.0',
'adbc-driver-sqlite>=0.8.0',
'beautifulsoup4>=4.11.2',
@@ -107,6 +107,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
'pytest>=7.3.2',
'pytest-xdist>=3.4.0',
'python-calamine>=0.1.7',
+ 'pytz>=2023.4',
'pyxlsb>=1.0.10',
'qtpy>=2.3.0',
'scipy>=1.10.0',
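With this hunk pytz drops out of the required dependencies and becomes an optional `timezone` extra, so users who still want pytz-backed timezones would opt in with something like `pip install "pandas[timezone]"` (extra name taken from the hunk above).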
diff --git a/requirements-dev.txt b/requirements-dev.txt
index dbfd7c6bf7bf5..52d2553fc4001 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -15,7 +15,6 @@ PyQt5>=5.15.9
coverage
python-dateutil
numpy<2
-pytz
beautifulsoup4>=4.11.2
blosc
bottleneck>=1.3.6
@@ -39,6 +38,7 @@ pymysql>=1.0.2
pyreadstat>=1.2.0
tables>=3.8.0
python-calamine>=0.1.7
+pytz>=2023.4
pyxlsb>=1.0.10
s3fs>=2022.11.0
scipy>=1.10.0
diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html
index aa4bfc92ce8a8..4c66f28818abd 100644
--- a/web/pandas/_templates/layout.html
+++ b/web/pandas/_templates/layout.html
@@ -73,8 +73,8 @@
-
-
+
+
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 49ece5564c300..c14996211bb8b 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -360,6 +360,13 @@ Deltalake python package lets you access tables stored in
JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert
any Delta table into Pandas dataframe.
+### [pandas-gbq](https://github.com/googleapis/python-bigquery-pandas)
+
+pandas-gbq provides high-performance reads and writes to and from
+[Google BigQuery](https://cloud.google.com/bigquery/). Previously (before pandas
+version 2.2.0), these methods were exposed as `pandas.read_gbq` and
+`DataFrame.to_gbq`. Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq` instead.
+
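A hedged usage sketch of the replacement entry points (the query, destination table, and project id are placeholders):

```python
import pandas_gbq

# read a query result into a DataFrame (placeholder query and project id)
df = pandas_gbq.read_gbq(
    "SELECT 1 AS x",
    project_id="my-project",
)

# write a DataFrame back to a BigQuery table (placeholder destination)
pandas_gbq.to_gbq(df, "my_dataset.my_table", project_id="my-project")
```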
## Out-of-core
### [Bodo](https://bodo.ai/)
@@ -513,6 +520,13 @@ Arrays](https://awkward-array.org/) inside pandas' Series and
DataFrame. It also provides an accessor for using awkward functions
on Series that are of awkward type.
+### [db-dtypes](https://github.com/googleapis/python-db-dtypes-pandas)
+
+db-dtypes provides extension types for working with types like
+DATE, TIME, and JSON from database systems. This package is used
+by pandas-gbq to provide natural dtypes for BigQuery data types that
+have no natural numpy equivalent.
+
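For illustration, a small sketch of the dtypes the package registers (the `"dbdate"`/`"dbtime"` names follow the db-dtypes documentation; treat this as a sketch):

```python
import datetime

import pandas as pd
import db_dtypes  # noqa: F401  # importing registers the "dbdate"/"dbtime" dtypes

dates = pd.Series([datetime.date(2021, 9, 17), None], dtype="dbdate")
times = pd.Series([datetime.time(1, 2, 3)], dtype="dbtime")
```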
### [Pandas-Genomics](https://pandas-genomics.readthedocs.io/en/latest/)
Pandas-Genomics provides an extension type and extension array for working