diff --git a/.circleci/config.yml b/.circleci/config.yml index 6f134c9a7a7bd..ebbca94718259 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,46 +1,66 @@ version: 2.1 jobs: - test-arm: + test-linux-arm: machine: image: default resource_class: arm.large environment: - ENV_FILE: ci/deps/circle-310-arm64.yaml + ENV_FILE: ci/deps/circle-311-arm64.yaml PYTEST_WORKERS: auto PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" PYTEST_TARGET: "pandas" PANDAS_CI: "1" steps: - checkout - - run: .circleci/setup_env.sh - - run: > - PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD - ci/run_tests.sh - linux-musl: + - run: + name: Install Environment and Run Tests + shell: /bin/bash -exo pipefail + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + command: | + MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" + wget -q $MINI_URL -O Miniforge3.sh + chmod +x Miniforge3.sh + MINI_DIR="$HOME/miniconda3" + rm -rf $MINI_DIR + ./Miniforge3.sh -b -p $MINI_DIR + export PATH=$MINI_DIR/bin:$PATH + conda info -a + conda env create -q -n pandas-dev -f $ENV_FILE + conda list -n pandas-dev + source activate pandas-dev + if pip show pandas 1>/dev/null; then + pip uninstall -y pandas + fi + python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror" + PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH + sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + ci/run_tests.sh + test-linux-musl: docker: - image: quay.io/pypa/musllinux_1_1_aarch64 resource_class: arm.large steps: # Install pkgs first to have git in the image # (needed for checkout) - - run: | - apk update - apk add git - apk add musl-locales + - run: + name: Install System Packages + command: | + apk update + apk add git + apk add musl-locales - checkout - - run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev - . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 - python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" - python -m pip list --no-cache-dir - - run: | - . ~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + - run: + name: Install Environment and Run Tests + command: | + /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 + python -m pip install --no-cache-dir --no-build-isolation -e . 
--config-settings=setup-args="--werror" + python -m pip list --no-cache-dir + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml build-aarch64: parameters: cibw-build: @@ -71,7 +91,7 @@ jobs: name: Build aarch64 wheels no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.15.0 + pip3 install cibuildwheel==2.18.1 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: @@ -79,21 +99,16 @@ jobs: - run: name: Install Anaconda Client & Upload Wheels + shell: /bin/bash -exo pipefail command: | - echo "Install Mambaforge" - MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" - echo "Downloading $MAMBA_URL" - wget -q $MAMBA_URL -O minimamba.sh - chmod +x minimamba.sh - - MAMBA_DIR="$HOME/miniconda3" - rm -rf $MAMBA_DIR - ./minimamba.sh -b -p $MAMBA_DIR - - export PATH=$MAMBA_DIR/bin:$PATH - - mamba install -y -c conda-forge anaconda-client - + MINI_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh" + wget -q $MINI_URL -O Miniforge3.sh + chmod +x Miniforge3.sh + MINI_DIR="$HOME/miniconda3" + rm -rf $MINI_DIR + ./Miniforge3.sh -b -p $MINI_DIR + export PATH=$MINI_DIR/bin:$PATH + conda install -y -c conda-forge anaconda-client source ci/upload_wheels.sh set_upload_vars upload_wheels @@ -107,14 +122,14 @@ workflows: not: equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] jobs: - - test-arm + - test-linux-arm test-musl: # Don't run trigger this one when scheduled pipeline runs when: not: equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] jobs: - - linux-musl + - test-linux-musl build-wheels: jobs: - build-aarch64: @@ -123,11 +138,9 @@ workflows: only: /^v.*/ matrix: parameters: - cibw-build: ["cp39-manylinux_aarch64", - "cp310-manylinux_aarch64", + cibw-build: ["cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64", - "cp39-musllinux_aarch64", "cp310-musllinux_aarch64", "cp311-musllinux_aarch64", "cp312-musllinux_aarch64",] diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh deleted file mode 100755 index eef4db1191a9a..0000000000000 --- a/.circleci/setup_env.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -e - -echo "Install Mambaforge" -MAMBA_URL="https://github.com/conda-forge/miniforge/releases/download/23.1.0-0/Mambaforge-23.1.0-0-Linux-aarch64.sh" -echo "Downloading $MAMBA_URL" -wget -q $MAMBA_URL -O minimamba.sh -chmod +x minimamba.sh - -MAMBA_DIR="$HOME/miniconda3" -rm -rf $MAMBA_DIR -./minimamba.sh -b -p $MAMBA_DIR - -export PATH=$MAMBA_DIR/bin:$PATH - -echo -echo "which conda" -which conda - -echo -echo "update conda" -conda config --set ssl_verify false -conda config --set quiet true --set always_yes true --set changeps1 false -mamba install -y -c conda-forge -n base pip setuptools - -echo "conda info -a" -conda info -a - -echo "conda list (root environment)" -conda list - -echo -# Clean up any left-over from a previous build -mamba env remove -n pandas-dev -echo "mamba env update --file=${ENV_FILE}" -# See https://github.com/mamba-org/mamba/issues/633 -mamba create -q -n pandas-dev -time mamba env update -n pandas-dev --file="${ENV_FILE}" - -echo "conda list -n pandas-dev" -conda list -n pandas-dev - -echo "activate pandas-dev" -source activate pandas-dev - -# Explicitly set an environment variable indicating that 
this is pandas' CI environment. -# -# This allows us to enable things like -Werror that shouldn't be activated in -# downstream CI jobs that may also build pandas from source. -export PANDAS_CI=1 - -if pip show pandas 1>/dev/null; then - echo - echo "remove any installed pandas package w/o removing anything else" - pip uninstall -y pandas -fi - -echo "Install pandas" -python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror" - -echo "done" diff --git a/.gitattributes b/.gitattributes index 19c6fd2fd1d47..b3d70ca8b24fb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -68,7 +68,7 @@ ci export-ignore doc export-ignore gitpod export-ignore MANIFEST.in export-ignore -scripts export-ignore +scripts/** export-ignore typings export-ignore web export-ignore CITATION.cff export-ignore @@ -82,3 +82,7 @@ setup.py export-ignore # csv_dir_path fixture checks the existence of the directory # exclude the whole directory to avoid running related tests in sdist pandas/tests/io/parser/data export-ignore + +# Include cibw script in sdist since it's needed for building wheels +scripts/cibw_before_build.sh -export-ignore +scripts/cibw_before_test.sh -export-ignore diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 63f687324b0ae..460ae2f8594c0 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -4,12 +4,6 @@ inputs: editable: description: Whether to build pandas in editable mode (default true) default: true - meson_args: - description: Extra flags to pass to meson - required: false - cflags_adds: - description: Items to append to the CFLAGS variable - required: false runs: using: composite steps: @@ -30,12 +24,11 @@ runs: - name: Build Pandas run: | - export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}" if [[ ${{ inputs.editable }} == "true" ]]; then - pip install -e . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \ + pip install -e . --no-build-isolation -v --no-deps \ --config-settings=setup-args="--werror" else - pip install . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \ + pip install . 
--no-build-isolation -v --no-deps \ --config-settings=setup-args="--werror" fi shell: bash -el {0} diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index 66e4142dc0cbb..f5d6abdf0f186 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -7,14 +7,14 @@ runs: shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Test results path: test-data.xml if: failure() - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: flags: unittests name: codecov-pandas diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 937af7e49c6d3..7e9c056e75131 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -51,6 +51,11 @@ jobs: # TODO: The doctests have to be run first right now, since the Cython doctests only work # with pandas installed in non-editable mode # This can be removed once pytest-cython doesn't require C extensions to be installed inplace + + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + - name: Run doctests run: cd ci && ./code_checks.sh doctests if: ${{ steps.build.outcome == 'success' && always() }} diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 924a6263f34d2..47b97fa57852a 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -46,6 +46,10 @@ jobs: - name: Build Pandas uses: ./.github/actions/build_pandas + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + - name: Test website run: python -m pytest web/ diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 2de1649d42dfd..97f90c1588962 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -53,7 +53,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.10', '3.11'] fail-fast: false name: Test Conda Forge Recipe - Python ${{ matrix.python-version }} concurrency: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 1b88d4d90d3e1..a085d0265a1a5 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -26,7 +26,7 @@ jobs: timeout-minutes: 90 strategy: matrix: - env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] include: @@ -35,7 +35,7 @@ jobs: pattern: "not slow and not network and not single_cpu" pytest_target: "pandas/tests/test_downstream.py" - name: "Minimum Versions" - env_file: actions-39-minimum_versions.yaml + env_file: actions-310-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" - name: "Locale: it_IT" env_file: actions-311.yaml @@ -57,6 +57,9 @@ jobs: # Also install zh_CN (its encoding is gb2312) but do not activate it. 
# It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" + - name: "Future infer strings" + env_file: actions-311.yaml + pandas_future_infer_string: "1" - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" @@ -74,9 +77,10 @@ jobs: PATTERN: ${{ matrix.pattern }} LANG: ${{ matrix.lang || 'C.UTF-8' }} LC_ALL: ${{ matrix.lc_all || '' }} - PANDAS_CI: ${{ matrix.pandas_ci || '1' }} + PANDAS_CI: '1' + PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }} TEST_ARGS: ${{ matrix.test_args || '' }} - PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }} + PYTEST_WORKERS: 'auto' PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }} # Clipboard tests @@ -88,7 +92,7 @@ jobs: services: mysql: - image: mysql:8.0.33 + image: mysql:8 env: MYSQL_ALLOW_EMPTY_PASSWORD: yes MYSQL_DATABASE: pandas @@ -101,7 +105,7 @@ jobs: - 3306:3306 postgres: - image: postgres:13 + image: postgres:16 env: PGUSER: postgres POSTGRES_USER: postgres @@ -116,7 +120,7 @@ jobs: - 5432:5432 moto: - image: motoserver/moto:4.1.13 + image: motoserver/moto:5.0.0 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret @@ -130,8 +134,8 @@ jobs: fetch-depth: 0 - name: Extra installs - run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }} - if: ${{ matrix.extra_apt }} + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ${{ matrix.extra_apt || ''}} - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests @@ -146,9 +150,8 @@ jobs: - name: Build Pandas id: build uses: ./.github/actions/build_pandas - with: - meson_args: ${{ matrix.meson_args }} - cflags_adds: ${{ matrix.cflags_adds }} + # TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge + if: ${{ matrix.name != 'Pypy' }} - name: Test (not single_cpu) uses: ./.github/actions/run-tests @@ -170,7 +173,7 @@ jobs: matrix: # Note: Don't use macOS latest since macos 14 appears to be arm64 only os: [macos-13, macos-14, windows-latest] - env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] fail-fast: false runs-on: ${{ matrix.os }} name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} @@ -228,7 +231,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -266,7 +269,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir @@ -300,7 +303,7 @@ jobs: # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs # to the corresponding posix/windows-macos/sdist etc. workflows. # Feel free to modify this comment as necessary. - if: false # Uncomment this to freeze the workflow, comment it to unfreeze + # if: false # Uncomment this to freeze the workflow, comment it to unfreeze defaults: run: shell: bash -eou pipefail {0} @@ -315,7 +318,7 @@ jobs: concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-dev cancel-in-progress: true env: @@ -332,7 +335,7 @@ jobs: - name: Set up Python Dev Version uses: actions/setup-python@v5 with: - python-version: '3.12-dev' + python-version: '3.13-dev' - name: Build Environment run: | @@ -340,13 +343,57 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" + python -m pip list + + - name: Run Tests + uses: ./.github/actions/run-tests + + python-freethreading: + defaults: + run: + shell: bash -eou pipefail {0} + runs-on: ubuntu-22.04 + + timeout-minutes: 90 + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-freethreading-dev + cancel-in-progress: true + + env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + PYTEST_TARGET: pandas + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python Free-threading Version + uses: deadsnakes/action@v3.1.0 + with: + python-version: 3.13-dev + nogil: true + + - name: Build Environment + run: | + python --version + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython + python -m pip install versioneer[toml] + python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . 
--no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" python -m pip list - name: Run Tests uses: ./.github/actions/run-tests + # NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml emscripten: # Note: the Python version, Emscripten toolchain version are determined # by the Pyodide version. The appropriate versions can be found in the diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ab0201ca623aa..58adb4efc0627 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -99,7 +99,25 @@ jobs: - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] + python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] + include: + # TODO: Remove this plus installing build deps in cibw_before_build.sh + # and test deps in cibw_before_test.sh after pandas can be built with a released NumPy/Cython + - python: ["cp313", "3.13"] + cibw_build_frontend: 'pip; args: --no-build-isolation' + - python: ["cp313t", "3.13"] + cibw_build_frontend: 'pip; args: --no-build-isolation' + # Build Pyodide wheels and upload them to Anaconda.org + # NOTE: this job is similar to the one in unit-tests.yml except for the fact + # that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup. + - buildplat: [ubuntu-22.04, pyodide_wasm32] + python: ["cp312", "3.12"] + cibw_build_frontend: 'build' + # TODO: Build free-threaded wheels for Windows + exclude: + - buildplat: [windows-2022, win_amd64] + python: ["cp313t", "3.13"] + env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -140,12 +158,14 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.18.0 + uses: pypa/cibuildwheel@v2.20.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }} + CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} - name: Set up Python uses: mamba-org/setup-micromamba@v1 @@ -168,15 +188,17 @@ jobs: - name: Test Windows Wheels if: ${{ matrix.buildplat[1] == 'win_amd64' }} shell: pwsh + # TODO: Remove NumPy nightly install when there's a 3.13 wheel on PyPI run: | $TST_CMD = @" - python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; + python -m pip install hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0; + ${{ matrix.python[1] == '3.13' && 'python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy;' }} python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ # add rc to the end of the image name if the Python version is unreleased - docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || 
format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD + docker pull python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.13' && '3.13-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v4 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a5de902866611..b81b9ba070a44 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.3 + rev: v0.5.0 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -40,7 +40,7 @@ repos: pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] @@ -67,15 +67,16 @@ repos: - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort rev: 5.13.2 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.15.2 + rev: v3.16.0 hooks: - id: pyupgrade - args: [--py39-plus] + args: [--py310-plus] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: @@ -92,7 +93,7 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.4 + rev: v18.1.8 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/MANIFEST.in b/MANIFEST.in index 9894381ed6252..f586d457eaaf8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -62,3 +62,7 @@ prune pandas/tests/io/parser/data # Selectively re-add *.cxx files that were excluded above graft pandas/_libs/src graft pandas/_libs/include + +# Include cibw script in sdist since it's needed for building wheels +include scripts/cibw_before_build.sh +include scripts/cibw_before_test.sh diff --git a/README.md b/README.md index e5329d66c2d89..715b0c9dc459c 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ The source code is currently hosted on GitHub at: https://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python -Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/). +Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://anaconda.org/conda-forge/pandas). 
```sh # conda diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 15e691d46f693..b2495356f134c 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -546,24 +546,17 @@ def time_chained_indexing(self, mode): class Block: - params = [ - (True, "True"), - (np.array(True), "np.array(True)"), - ] - - def setup(self, true_value, mode): + def setup(self): self.df = DataFrame( False, columns=np.arange(500).astype(str), index=date_range("2010-01-01", "2011-01-01"), ) - self.true_value = true_value - - def time_test(self, true_value, mode): + def time_test(self): start = datetime(2010, 5, 1) end = datetime(2010, 9, 1) - self.df.loc[start:end, :] = true_value + self.df.loc[start:end, :] = True from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 3a2baec54109a..fe31879e67a67 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -19,10 +19,15 @@ class TimeGetTimedeltaField: def setup(self, size, field): arr = np.random.randint(0, 10, size=size, dtype="i8") self.i8data = arr + arr = np.random.randint(-86400 * 1_000_000_000, 0, size=size, dtype="i8") + self.i8data_negative = arr def time_get_timedelta_field(self, size, field): get_timedelta_field(self.i8data, field) + def time_get_timedelta_field_negative_td(self, size, field): + get_timedelta_field(self.i8data_negative, field) + class TimeGetDateField: params = [ @@ -72,3 +77,6 @@ def setup(self, size, side, period, freqstr, month_kw): def time_get_start_end_field(self, size, side, period, freqstr, month_kw): get_start_end_field(self.i8data, self.attrname, freqstr, month_kw=month_kw) + + +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py index dcc73aefc6c7a..9d9689fcfa94b 100644 --- a/asv_bench/benchmarks/tslibs/timedelta.py +++ b/asv_bench/benchmarks/tslibs/timedelta.py @@ -20,7 +20,7 @@ def time_from_int(self): Timedelta(123456789) def time_from_unit(self): - Timedelta(1, unit="d") + Timedelta(1, unit="D") def time_from_components(self): Timedelta( diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index 082220ee0dff2..6145966fb6a0e 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -1,7 +1,10 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) +import zoneinfo import numpy as np -import pytz from pandas import Timestamp @@ -12,7 +15,7 @@ class TimestampConstruction: def setup(self): self.npdatetime64 = np.datetime64("2020-01-01 00:00:00") self.dttime_unaware = datetime(2020, 1, 1, 0, 0, 0) - self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC) + self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, timezone.utc) self.ts = Timestamp("2020-01-01 00:00:00") def time_parse_iso8601_no_tz(self): @@ -113,7 +116,7 @@ def setup(self, tz): self.ts = Timestamp("2017-08-25 08:16:14", tz=tz) def time_replace_tz(self, tz): - self.ts.replace(tzinfo=pytz.timezone("US/Eastern")) + self.ts.replace(tzinfo=zoneinfo.ZoneInfo("US/Eastern")) def time_replace_None(self, tz): self.ts.replace(tzinfo=None) @@ -144,8 +147,8 @@ def time_ceil(self, tz): class TimestampAcrossDst: def setup(self): - dt = datetime(2016, 3, 27, 1) - self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + dt = datetime(2016, 3, 
27, 1, fold=0) + self.tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo self.ts2 = Timestamp(dt) def time_replace_across_dst(self): diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index 4a011d4bb3f06..885cf48d01743 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -20,13 +20,13 @@ timedelta, timezone, ) +import zoneinfo from dateutil.tz import ( gettz, tzlocal, ) import numpy as np -import pytz try: from pandas._libs.tslibs import ints_to_pydatetime @@ -38,7 +38,7 @@ None, timezone.utc, timezone(timedelta(minutes=60)), - pytz.timezone("US/Pacific"), + zoneinfo.ZoneInfo("US/Pacific"), gettz("Asia/Tokyo"), tzlocal_obj, ] diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py index c6b510efdca69..c87adb5e5d0e9 100644 --- a/asv_bench/benchmarks/tslibs/tz_convert.py +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -1,5 +1,6 @@ +from datetime import timezone + import numpy as np -from pytz import UTC from pandas._libs.tslibs.tzconversion import tz_localize_to_utc @@ -41,7 +42,7 @@ def time_tz_convert_from_utc(self, size, tz): # dti = DatetimeIndex(self.i8data, tz=tz) # dti.tz_localize(None) if old_sig: - tz_convert_from_utc(self.i8data, UTC, tz) + tz_convert_from_utc(self.i8data, timezone.utc, tz) else: tz_convert_from_utc(self.i8data, tz) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a93e23a0f5022..c69b47ae1d4e8 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,43 +70,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.DataFrame.max RT03" \ - -i "pandas.DataFrame.mean RT03,SA01" \ - -i "pandas.DataFrame.median RT03,SA01" \ - -i "pandas.DataFrame.min RT03" \ - -i "pandas.DataFrame.plot PR02,SA01" \ - -i "pandas.Grouper PR02" \ - -i "pandas.MultiIndex PR01" \ - -i "pandas.MultiIndex.append PR07,SA01" \ - -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ - -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ - -i "pandas.MultiIndex.dtypes SA01" \ - -i "pandas.MultiIndex.get_level_values SA01" \ - -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ - -i "pandas.MultiIndex.levels SA01" \ - -i "pandas.MultiIndex.levshape SA01" \ -i "pandas.MultiIndex.names SA01" \ - -i "pandas.MultiIndex.nlevels SA01" \ - -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \ -i "pandas.MultiIndex.reorder_levels RT03,SA01" \ - -i "pandas.MultiIndex.set_codes SA01" \ - -i "pandas.MultiIndex.set_levels RT03,SA01" \ -i "pandas.MultiIndex.sortlevel PR07,SA01" \ -i "pandas.MultiIndex.to_frame RT03" \ - -i "pandas.MultiIndex.truncate SA01" \ -i "pandas.NA SA01" \ -i "pandas.NaT SA01" \ - -i "pandas.NamedAgg SA01" \ - -i "pandas.Period SA01" \ -i "pandas.Period.asfreq SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.freqstr SA01" \ - -i "pandas.Period.is_leap_year SA01" \ -i "pandas.Period.month SA01" \ - -i "pandas.Period.now SA01" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.Period.quarter SA01" \ -i "pandas.Period.strftime PR01,SA01" \ -i "pandas.Period.to_timestamp SA01" \ -i "pandas.Period.year SA01" \ @@ -164,7 +139,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_convert PR01,PR02" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i 
"pandas.Series.dt.unit GL08" \ - -i "pandas.Series.ge SA01" \ -i "pandas.Series.gt SA01" \ -i "pandas.Series.list.__getitem__ SA01" \ -i "pandas.Series.list.flatten SA01" \ @@ -172,13 +146,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.lt SA01" \ -i "pandas.Series.ne SA01" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.plot PR02,SA01" \ - -i "pandas.Series.pop RT03,SA01" \ + -i "pandas.Series.pop SA01" \ -i "pandas.Series.prod RT03" \ -i "pandas.Series.product RT03" \ -i "pandas.Series.reorder_levels RT03,SA01" \ -i "pandas.Series.sem PR01,RT03,SA01" \ - -i "pandas.Series.skew RT03,SA01" \ -i "pandas.Series.sparse PR01,SA01" \ -i "pandas.Series.sparse.density SA01" \ -i "pandas.Series.sparse.fill_value SA01" \ @@ -192,9 +164,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.center RT03,SA01" \ -i "pandas.Series.str.decode PR07,RT03,SA01" \ -i "pandas.Series.str.encode PR07,RT03,SA01" \ - -i "pandas.Series.str.find RT03" \ -i "pandas.Series.str.fullmatch RT03" \ - -i "pandas.Series.str.get RT03,SA01" \ -i "pandas.Series.str.index RT03" \ -i "pandas.Series.str.ljust RT03,SA01" \ -i "pandas.Series.str.lower RT03" \ @@ -204,7 +174,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.partition RT03" \ -i "pandas.Series.str.repeat SA01" \ -i "pandas.Series.str.replace SA01" \ - -i "pandas.Series.str.rfind RT03" \ -i "pandas.Series.str.rindex RT03" \ -i "pandas.Series.str.rjust RT03,SA01" \ -i "pandas.Series.str.rpartition RT03" \ @@ -216,15 +185,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.wrap RT03,SA01" \ -i "pandas.Series.str.zfill RT03" \ -i "pandas.Series.struct.dtypes SA01" \ - -i "pandas.Series.to_dict SA01" \ - -i "pandas.Series.to_frame SA01" \ -i "pandas.Series.to_markdown SA01" \ -i "pandas.Series.update PR07,SA01" \ - -i "pandas.Timedelta.as_unit SA01" \ -i "pandas.Timedelta.asm8 SA01" \ -i "pandas.Timedelta.ceil SA01" \ -i "pandas.Timedelta.components SA01" \ - -i "pandas.Timedelta.days SA01" \ -i "pandas.Timedelta.floor SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ @@ -244,12 +209,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.ctime SA01" \ -i "pandas.Timestamp.date SA01" \ -i "pandas.Timestamp.day GL08" \ - -i "pandas.Timestamp.floor SA01" \ -i "pandas.Timestamp.fold GL08" \ -i "pandas.Timestamp.fromordinal SA01" \ -i "pandas.Timestamp.fromtimestamp PR01,SA01" \ -i "pandas.Timestamp.hour GL08" \ - -i "pandas.Timestamp.isoweekday SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.microsecond GL08" \ -i "pandas.Timestamp.min PR02" \ @@ -258,13 +221,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.month_name SA01" \ -i "pandas.Timestamp.nanosecond GL08" \ -i "pandas.Timestamp.normalize SA01" \ - -i "pandas.Timestamp.now SA01" \ -i "pandas.Timestamp.quarter SA01" \ -i "pandas.Timestamp.replace PR07,SA01" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.second GL08" \ -i "pandas.Timestamp.strptime PR01,SA01" \ - -i "pandas.Timestamp.time SA01" \ -i "pandas.Timestamp.timestamp SA01" \ -i "pandas.Timestamp.timetuple SA01" \ -i "pandas.Timestamp.timetz SA01" \ @@ -274,7 +235,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.to_period PR01,SA01" \ -i "pandas.Timestamp.today SA01" \ -i "pandas.Timestamp.toordinal SA01" \ - -i "pandas.Timestamp.tz SA01" \ -i "pandas.Timestamp.tz_localize SA01" \ -i 
"pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.tzname SA01" \ @@ -303,9 +263,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.extensions.ExtensionArray.tolist RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.unique RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.view SA01" \ - -i "pandas.api.indexers.VariableOffsetWindowIndexer PR01,SA01" \ -i "pandas.api.interchange.from_dataframe RT03,SA01" \ - -i "pandas.api.types.is_any_real_numeric_dtype SA01" \ -i "pandas.api.types.is_bool PR01,SA01" \ -i "pandas.api.types.is_bool_dtype SA01" \ -i "pandas.api.types.is_categorical_dtype SA01" \ @@ -332,13 +290,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_period_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ - -i "pandas.api.types.is_signed_integer_dtype SA01" \ - -i "pandas.api.types.is_sparse SA01" \ - -i "pandas.api.types.is_string_dtype SA01" \ -i "pandas.api.types.is_timedelta64_ns_dtype SA01" \ - -i "pandas.api.types.is_unsigned_integer_dtype SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ - -i "pandas.api.types.union_categoricals RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ -i "pandas.arrays.BooleanArray SA01" \ -i "pandas.arrays.DatetimeArray SA01" \ @@ -351,7 +304,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.SparseArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ - -i "pandas.bdate_range RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ @@ -362,12 +314,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.max SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.median SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.min SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.plot PR02,SA01" \ + -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.prod SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ @@ -381,11 +332,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.max SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.median SA01" \ -i "pandas.core.groupby.SeriesGroupBy.min SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.plot PR02,SA01" \ + -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.prod SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \ @@ -396,7 +346,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.indices SA01" \ -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.mean SA01" \ - -i 
"pandas.core.resample.Resampler.median SA01" \ -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.ohlc SA01" \ -i "pandas.core.resample.Resampler.prod SA01" \ @@ -412,7 +361,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.window.rolling.Window.std PR01" \ -i "pandas.core.window.rolling.Window.var PR01" \ -i "pandas.date_range RT03" \ - -i "pandas.describe_option SA01" \ -i "pandas.errors.AbstractMethodError PR01,SA01" \ -i "pandas.errors.AttributeConflictWarning SA01" \ -i "pandas.errors.CSSWarning SA01" \ @@ -475,34 +423,12 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.merge PR07" \ - -i "pandas.merge_asof PR07,RT03" \ - -i "pandas.merge_ordered PR07" \ - -i "pandas.option_context SA01" \ -i "pandas.period_range RT03,SA01" \ - -i "pandas.pivot PR07" \ - -i "pandas.pivot_table PR07" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.autocorrelation_plot RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ - -i "pandas.plotting.parallel_coordinates PR07,RT03,SA01" \ - -i "pandas.plotting.plot_params SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ - -i "pandas.plotting.table PR07,RT03,SA01" \ - -i "pandas.qcut PR07,SA01" \ - -i "pandas.read_feather SA01" \ - -i "pandas.read_orc SA01" \ - -i "pandas.read_sas SA01" \ - -i "pandas.read_spss SA01" \ - -i "pandas.reset_option SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.set_option SA01" \ - -i "pandas.show_versions SA01" \ - -i "pandas.test SA01" \ -i "pandas.testing.assert_extension_array_equal SA01" \ - -i "pandas.testing.assert_index_equal PR07,SA01" \ - -i "pandas.testing.assert_series_equal PR07,SA01" \ - -i "pandas.timedelta_range SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin PR02" \ -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \ @@ -787,7 +713,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.YearBegin.nanos GL08" \ -i "pandas.tseries.offsets.YearBegin.normalize GL08" \ -i "pandas.tseries.offsets.YearBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.YearEnd PR02" \ -i "pandas.tseries.offsets.YearEnd.freqstr SA01" \ -i "pandas.tseries.offsets.YearEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.YearEnd.month GL08" \ @@ -795,8 +720,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.YearEnd.nanos GL08" \ -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ -i "pandas.tseries.offsets.YearEnd.rule_code GL08" \ - -i "pandas.unique PR07" \ - -i "pandas.util.hash_array PR07,SA01" \ -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml similarity index 91% rename from ci/deps/actions-39-minimum_versions.yaml rename to ci/deps/actions-310-minimum_versions.yaml index b760f27a3d4d3..0c46f476893dd 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -4,7 +4,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.9 + - python=3.10 # build dependencies - versioneer[toml] @@ -15,9 +15,9 @@ dependencies: # test dependencies - 
pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -32,7 +32,7 @@ dependencies: - fastparquet=2023.10.0 - fsspec=2022.11.0 - html5lib=1.1 - - hypothesis=6.46.1 + - hypothesis=6.84.0 - gcsfs=2022.11.0 - jinja2=3.1.2 - lxml=4.9.2 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index ed7dfe1a3c17e..0af46752f5b3d 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -13,8 +13,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -29,7 +30,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 @@ -60,4 +61,3 @@ dependencies: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 - - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index dd1d341c70a9b..1a842c7212c1f 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -14,9 +14,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -31,7 +31,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 61a0eabbf133c..748cfa861ec32 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -13,8 +13,8 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # pandas dependencies - python-dateutil diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index d84063ac2a9ba..469fb1bfb9138 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -13,12 +13,12 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz - pip diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 388116439f944..75394e2c8e109 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -13,8 +13,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -29,7 +30,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 @@ -59,4 +60,3 @@ dependencies: - pip: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 1d9f8aa3b092a..d4b43ddef3601 100644 --- 
a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -13,8 +13,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -29,7 +30,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 @@ -60,4 +61,3 @@ dependencies: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 - - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml deleted file mode 100644 index 8f235a836bb3d..0000000000000 --- a/ci/deps/actions-39.yaml +++ /dev/null @@ -1,63 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.9 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - meson[ninja]=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # optional dependencies - - beautifulsoup4>=4.11.2 - - blosc>=1.21.3 - - bottleneck>=1.3.6 - - fastparquet>=2023.10.0 - - fsspec>=2022.11.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.11.0 - - jinja2>=3.1.2 - - lxml>=4.9.2 - - matplotlib>=3.6.3 - - numba>=0.56.4 - - numexpr>=2.8.4 - - odfpy>=1.4.1 - - qtpy>=2.3.0 - - openpyxl>=3.1.0 - - psycopg2>=2.9.6 - - pyarrow>=10.0.1 - - pymysql>=1.0.2 - - pyqt>=5.15.9 - - pyreadstat>=1.2.0 - - pytables>=3.8.0 - - python-calamine>=0.1.7 - - pyxlsb>=1.0.10 - - s3fs>=2022.11.0 - - scipy>=1.10.0 - - sqlalchemy>=2.0.0 - - tabulate>=0.9.0 - - xarray>=2022.12.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.5 - - zstandard>=0.19.0 - - - pip: - - adbc-driver-postgresql>=0.10.0 - - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 - - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index d9c8dd81b7c33..b0ae9f1e48473 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -16,8 +16,8 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # required - numpy diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-311-arm64.yaml similarity index 90% rename from ci/deps/circle-310-arm64.yaml rename to ci/deps/circle-311-arm64.yaml index ed4d139714e71..18535d81e6985 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.10 + - python=3.11 # build dependencies - versioneer[toml] @@ -13,9 +13,9 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies @@ -30,7 +30,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 diff --git a/ci/meta.yaml b/ci/meta.yaml index aac5593e493b7..b76bef2f630b7 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -64,9 +64,9 @@ test: requires: - pip - pytest >=7.3.2 - - pytest-xdist >=2.2.0 + - pytest-xdist >=3.4.0 - pytest-cov - - hypothesis >=6.46.1 + - hypothesis >=6.84.0 - tomli # [py<311] about: 
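(Side note on the asv benchmark hunks above, which swap `pytz` for the standard-library `zoneinfo`: a minimal sketch of the two idioms, assuming `pytz` is still installed — it remains a pandas dependency at this point — and using "US/Eastern" purely as an example zone.)

```python
from datetime import datetime
from zoneinfo import ZoneInfo

import pytz

dt = datetime(2020, 1, 1)

# pytz idiom (removed above): localize() applies the zone's offset rules.
aware_pytz = pytz.timezone("US/Eastern").localize(dt)

# zoneinfo idiom (added above): the tzinfo can be attached directly via
# replace(), since zoneinfo resolves offsets per-instant; PEP 495's fold
# attribute takes over pytz's is_dst= for disambiguating DST transitions.
aware_zi = dt.replace(tzinfo=ZoneInfo("US/Eastern"))

assert aware_pytz.utcoffset() == aware_zi.utcoffset()
```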
diff --git a/doc/data/titanic.csv b/doc/data/titanic.csv index 5cc466e97cf12..0f7d184728a17 100644 --- a/doc/data/titanic.csv +++ b/doc/data/titanic.csv @@ -1,93 +1,93 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S 2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C -3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +3,1,3,"Heikkinen, Miss Laina",female,26,0,0,STON/O2. 3101282,7.925,,S 4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S 5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S 6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q 7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S -8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S +8,0,3,"Palsson, Master Gosta Leonard",male,2,3,1,349909,21.075,,S 9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S 10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C -11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S -12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S +11,1,3,"Sandstrom, Miss Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S +12,1,1,"Bonnell, Miss Elizabeth",female,58,0,0,113783,26.55,C103,S 13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S 14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S -15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S +15,0,3,"Vestrom, Miss Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S 16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S -17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q +17,0,3,"Rice, Master Eugene",male,2,4,1,382652,29.125,,Q 18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S 19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S 20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C 21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S 22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S -23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q +23,1,3,"McGowan, Miss Anna ""Annie""",female,15,0,0,330923,8.0292,,Q 24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S -25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S +25,0,3,"Palsson, Miss Torborg Danira",female,8,3,1,349909,21.075,,S 26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S 27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C 28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S -29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q +29,1,3,"O'Dwyer, Miss Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q 30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S 31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C 32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C -33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q +33,1,3,"Glynn, Miss Mary Agatha",female,,0,0,335677,7.75,,Q 34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S 35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C 36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S 37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C 38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 
2152,8.05,,S -39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S -40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C +39,0,3,"Vander Planke, Miss Augusta Maria",female,18,2,0,345764,18,,S +40,1,3,"Nicola-Yarred, Miss Jamila",female,14,1,0,2651,11.2417,,C 41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S 42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S 43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C -44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C -45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q +44,1,2,"Laroche, Miss Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C +45,1,3,"Devaney, Miss Margaret Delia",female,19,0,0,330958,7.8792,,Q 46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S 47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q -48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q +48,1,3,"O'Driscoll, Miss Bridget",female,,0,0,14311,7.75,,Q 49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C 50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S -51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S +51,0,3,"Panula, Master Juha Niilo",male,7,4,1,3101295,39.6875,,S 52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S 53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C 54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S 55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C 56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S -57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S +57,1,2,"Rugg, Miss Emily",female,21,0,0,C.A. 31026,10.5,,S 58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C -59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S -60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S +59,1,2,"West, Miss Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S +60,0,3,"Goodwin, Master William Frederick",male,11,5,2,CA 2144,46.9,,S 61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C -62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, +62,1,1,"Icard, Miss Amelie",female,38,0,0,113572,80,B28, 63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S -64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S +64,0,3,"Skoog, Master Harald",male,4,3,2,347088,27.9,,S 65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C -66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C +66,1,3,"Moubarek, Master Gerios",male,,1,1,2661,15.2458,,C 67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S 68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S -69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S +69,1,3,"Andersson, Miss Erna Alexandra",female,17,4,2,3101281,7.925,,S 70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S 71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S -72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S +72,0,3,"Goodwin, Miss Lillian Amy",female,16,5,2,CA 2144,46.9,,S 73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S 74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C 75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S 76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S 77,0,3,"Staneff, Mr. 
Ivan",male,,0,0,349208,7.8958,,S 78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S -79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S -80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S +79,1,2,"Caldwell, Master Alden Gates",male,0.83,0,2,248738,29,,S +80,1,3,"Dowdell, Miss Elizabeth",female,30,0,0,364516,12.475,,S 81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S 82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S -83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q +83,1,3,"McDermott, Miss Brigdet Delia",female,,0,0,330932,7.7875,,Q 84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S -85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S +85,1,2,"Ilett, Miss Bertha",female,17,0,0,SO/C 14885,10.5,,S 86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S 87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S 88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S -89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S +89,1,1,"Fortune, Miss Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S 90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S 91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S 92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S @@ -99,35 +99,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C 99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S 100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S -101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S +101,0,3,"Petranec, Miss Matilda",female,28,0,0,349245,7.8958,,S 102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S 103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S 104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S 105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S 106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S -107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S +107,1,3,"Salkjelsvik, Miss Anna Kristine",female,21,0,0,343120,7.65,,S 108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S 109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S -110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q +110,1,3,"Moran, Miss Bertha",female,,1,0,371110,24.15,,Q 111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S -112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C +112,0,3,"Zabour, Miss Hileni",female,14.5,1,0,2665,14.4542,,C 113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S -114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S -115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C +114,0,3,"Jussila, Miss Katriina",female,20,1,0,4136,9.825,,S +115,0,3,"Attalah, Miss Malake",female,17,0,0,2627,14.4583,,C 116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S 117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q 118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S 119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C -120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S +120,0,3,"Andersson, Miss Ellis Anna Maria",female,2,4,2,347082,31.275,,S 121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S 122,0,3,"Moore, Mr. 
Leonard Charles",male,,0,0,A4. 54510,8.05,,S 123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C -124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S +124,1,2,"Webber, Miss Susan",female,32.5,0,0,27267,13,E101,S 125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S -126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C +126,1,3,"Nicola-Yarred, Master Elias",male,12,1,0,2651,11.2417,,C 127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q 128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S -129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C +129,1,3,"Peter, Miss Anna",female,,1,1,2668,22.3583,F E69,C 130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S 131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C 132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S @@ -135,18 +135,18 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S 135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S 136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C -137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S +137,1,1,"Newsom, Miss Helen Monypeny",female,19,0,2,11752,26.2833,D47,S 138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S 139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S 140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C 141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C -142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S +142,1,3,"Nysten, Miss Anna Sofia",female,22,0,0,347081,7.75,,S 143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S 144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q 145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S 146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S 147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S -148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S +148,0,3,"Ford, Miss Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S 149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S 150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S 151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S @@ -155,35 +155,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S 155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S 156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C -157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q +157,1,3,"Gilnagh, Miss Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q 158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S 159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S -160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S +160,0,3,"Sage, Master Thomas Henry",male,,8,2,CA. 2343,69.55,,S 161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S 162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S 163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S 164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S -165,0,3,"Panula, Master. 
Eino Viljami",male,1,4,1,3101295,39.6875,,S -166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S +165,0,3,"Panula, Master Eino Viljami",male,1,4,1,3101295,39.6875,,S +166,1,3,"Goldsmith, Master Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S 167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S 168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S 169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S 170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S 171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S -172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q -173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S +172,0,3,"Rice, Master Arthur",male,4,4,1,382652,29.125,,Q +173,1,3,"Johnson, Miss Eleanor Ileen",female,1,1,1,347742,11.1333,,S 174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S 175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C 176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S -177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S -178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C +177,0,3,"Lefebre, Master Henry Forbes",male,,3,1,4133,25.4667,,S +178,0,1,"Isham, Miss Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C 179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S 180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S -181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +181,0,3,"Sage, Miss Constance Gladys",female,,8,2,CA. 2343,69.55,,S 182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C -183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S -184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S -185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S +183,0,3,"Asplund, Master Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S +184,1,2,"Becker, Master Richard F",male,1,2,1,230136,39,F4,S +185,1,3,"Kink-Heilmann, Miss Luise Gretchen",female,4,0,2,315153,22.025,,S 186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S 187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q 188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S @@ -191,33 +191,33 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S 191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S 192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S -193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S -194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S +193,1,3,"Andersen-Jensen, Miss Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S +194,1,2,"Navratil, Master Michel M",male,3,1,1,230080,26,F2,S 195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C -196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C +196,1,1,"Lurette, Miss Elise",female,58,0,0,PC 17569,146.5208,B80,C 197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q 198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S -199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q -200,0,2,"Yrois, Miss. 
Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S +199,1,3,"Madigan, Miss Margaret ""Maggie""",female,,0,0,370370,7.75,,Q +200,0,2,"Yrois, Miss Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S 201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S 202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S 203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S 204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C 205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S -206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S +206,0,3,"Strom, Miss Telma Matilda",female,2,0,1,347054,10.4625,G6,S 207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S 208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C -209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q +209,1,3,"Carr, Miss Helen ""Ellen""",female,16,0,0,367231,7.75,,Q 210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C 211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S -212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S +212,1,2,"Cameron, Miss Clear Annie",female,35,0,0,F.C.C. 13528,21,,S 213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S 214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S 215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q -216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C -217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S +216,1,1,"Newell, Miss Madeleine",female,31,1,0,35273,113.275,D36,C +217,1,3,"Honkanen, Miss Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S 218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S -219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C +219,1,1,"Bazzani, Miss Albina",female,32,0,0,11813,76.2917,D15,C 220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S 221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S 222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S @@ -228,24 +228,24 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S 228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S 229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S -230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S +230,0,3,"Lefebre, Miss Mathilde",female,,3,1,4133,25.4667,,S 231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S 232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S 233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S -234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S +234,1,3,"Asplund, Miss Lillian Gertrud",female,5,4,2,347077,31.3875,,S 235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S -236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +236,0,3,"Harknett, Miss Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S 237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S -238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S +238,1,2,"Collyer, Miss Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S 239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S 240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S -241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C -242,1,3,"Murphy, Miss. 
Katherine ""Kate""",female,,1,0,367230,15.5,,Q +241,0,3,"Zabour, Miss Thamine",female,,1,0,2665,14.4542,,C +242,1,3,"Murphy, Miss Katherine ""Kate""",female,,1,0,367230,15.5,,Q 243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S 244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S 245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C 246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q -247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S +247,0,3,"Lindahl, Miss Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S 248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S 249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S 250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S @@ -256,28 +256,28 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S 256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C 257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C -258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S -259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C +258,1,1,"Cherry, Miss Gladys",female,30,0,0,110152,86.5,B77,S +259,1,1,"Ward, Miss Anna",female,35,0,0,PC 17755,512.3292,,C 260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S 261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q -262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S +262,1,3,"Asplund, Master Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S 263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S 264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S -265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q +265,0,3,"Henry, Miss Delia",female,,0,0,382649,7.75,,Q 266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S 267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S 268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S 269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S -270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S +270,1,1,"Bissette, Miss Amelia",female,35,0,0,PC 17760,135.6333,C99,S 271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S 272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S 273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S 274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C -275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q -276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S -277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S +275,1,3,"Healy, Miss Hanora ""Nora""",female,,0,0,370375,7.75,,Q +276,1,1,"Andrews, Miss Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S +277,0,3,"Lindblom, Miss Augusta Charlotta",female,45,0,0,347073,7.75,,S 278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S -279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q +279,0,3,"Rice, Master Eric",male,7,4,1,382652,29.125,,Q 280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S 281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q 282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S @@ -288,66 +288,66 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 287,1,3,"de Mulder, Mr. 
Theodore",male,30,0,0,345774,9.5,,S 288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S 289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S -290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q -291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S +290,1,3,"Connolly, Miss Kate",female,22,0,0,370373,7.75,,Q +291,1,1,"Barber, Miss Ellen ""Nellie""",female,26,0,0,19877,78.85,,S 292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C 293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C -294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S +294,0,3,"Haas, Miss Aloisia",female,24,0,0,349236,8.85,,S 295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S 296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C 297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C -298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S +298,0,1,"Allison, Miss Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S 299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S 300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C -301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +301,1,3,"Kelly, Miss Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q 302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q 303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S -304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +304,1,2,"Keane, Miss Nora A",female,,0,0,226593,12.35,E101,Q 305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S -306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S -307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C +306,1,1,"Allison, Master Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +307,1,1,"Fleming, Miss Margaret",female,,0,0,17421,110.8833,,C 308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C 309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C -310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C -311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C -312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C +310,1,1,"Francatelli, Miss Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C +311,1,1,"Hays, Miss Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C +312,1,1,"Ryerson, Miss Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C 313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S 314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S 315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S -316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S +316,1,3,"Nilsson, Miss Helmina Josefina",female,26,0,0,347470,7.8542,,S 317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S 318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S -319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S +319,1,1,"Wick, Miss Mary Natalie",female,31,0,2,36928,164.8667,C7,S 320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C 321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S 322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S -323,1,2,"Slayter, Miss. 
Hilda Mary",female,30,0,0,234818,12.35,,Q +323,1,2,"Slayter, Miss Hilda Mary",female,30,0,0,234818,12.35,,Q 324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S 325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S -326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C +326,1,1,"Young, Miss Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C 327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S 328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S 329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S -330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C -331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q +330,1,1,"Hippach, Miss Jean Gertrude",female,16,0,1,111361,57.9792,B18,C +331,1,3,"McCoy, Miss Agnes",female,,2,0,367226,23.25,,Q 332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S 333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S 334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S 335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S 336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S 337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S -338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C +338,1,1,"Burns, Miss Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C 339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S 340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S -341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S -342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S +341,1,2,"Navratil, Master Edmond Roger",male,2,1,1,230080,26,F2,S +342,1,1,"Fortune, Miss Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S 343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S 344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S 345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S -346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S -347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S +346,1,2,"Brown, Miss Amelia ""Mildred""",female,24,0,0,248733,13,F33,S +347,1,2,"Smith, Miss Marion Elsie",female,40,0,0,31418,13,,S 348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S -349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S +349,1,3,"Coutts, Master William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S 350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S 351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S 352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S @@ -355,10 +355,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S 355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C 356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S -357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S -358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S -359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q -360,1,3,"Mockler, Miss. 
Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +357,1,1,"Bowerman, Miss Elsie Edith",female,22,0,1,113505,55,E33,S +358,0,2,"Funk, Miss Annie Clemmer",female,38,0,0,237671,13,,S +359,1,3,"McGovern, Miss Mary",female,,0,0,330931,7.8792,,Q +360,1,3,"Mockler, Miss Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q 361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S 362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C 363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C @@ -367,58 +367,58 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S 367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C 368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C -369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q +369,1,3,"Jermyn, Miss Annie",female,,0,0,14313,7.75,,Q 370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C 371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C 372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S 373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S 374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C -375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S +375,0,3,"Palsson, Miss Stina Viola",female,3,3,1,349909,21.075,,S 376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C -377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S +377,1,3,"Landergren, Miss Aurora Adelia",female,22,0,0,C 7077,7.25,,S 378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C 379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C 380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S -381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C -382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C +381,1,1,"Bidois, Miss Rosalie",female,42,0,0,PC 17757,227.525,,C +382,1,3,"Nakid, Miss Maria (""Mary"")",female,1,0,2,2653,15.7417,,C 383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S 384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S 385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S 386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S -387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S -388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S +387,0,3,"Goodwin, Master Sidney Leonard",male,1,5,2,CA 2144,46.9,,S +388,1,2,"Buss, Miss Kate",female,36,0,0,27849,13,,S 389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q -390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C +390,1,2,"Lehmann, Miss Bertha",female,17,0,0,SC 1748,12,,C 391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S 392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S 393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S -394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C +394,1,1,"Newell, Miss Marjorie",female,23,1,0,35273,113.275,D36,C 395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S 396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S -397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S +397,0,3,"Olsson, Miss Elina",female,31,0,0,350407,7.8542,,S 398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S 399,0,2,"Pain, Dr. 
Alfred",male,23,0,0,244278,10.5,,S 400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S 401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S 402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S -403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S +403,0,3,"Jussila, Miss Mari Aina",female,21,1,0,4137,9.825,,S 404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S -405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S +405,0,3,"Oreskovic, Miss Marija",female,20,0,0,315096,8.6625,,S 406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S 407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S -408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S +408,1,2,"Richards, Master William Rowe",male,3,1,1,29106,18.75,,S 409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S -410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S +410,0,3,"Lefebre, Miss Ida",female,,3,1,4133,25.4667,,S 411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S 412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q -413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q +413,1,1,"Minahan, Miss Daisy E",female,33,1,0,19928,90,C78,Q 414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S 415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S 416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S 417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S -418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S +418,1,2,"Silven, Miss Lyyli Karoliina",female,18,0,2,250652,13,,S 419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S -420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S +420,0,3,"Van Impe, Miss Catharina",female,10,0,2,345773,24.15,,S 421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C 422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q 423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S @@ -426,7 +426,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S 426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S 427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S -428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S +428,1,2,"Phillips, Miss Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S 429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q 430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S 431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S @@ -434,8 +434,8 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S 434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S 435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S -436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S -437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S +436,1,1,"Carter, Miss Lucile Polk",female,14,1,2,113760,120,B96 B98,S +437,0,3,"Ford, Miss Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S 438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S 439,0,1,"Fortune, Mr. 
Mark",male,64,1,4,19950,263,C23 C25 C27,S 440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S @@ -444,10 +444,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S 444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S 445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S -446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S -447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S +446,1,1,"Dodge, Master Washington",male,4,0,2,33638,81.8583,A34,S +447,1,2,"Mellinger, Miss Madeleine Violet",female,13,0,1,250644,19.5,,S 448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S -449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C +449,1,3,"Baclini, Miss Marie Catherine",female,5,2,1,2666,19.2583,,C 450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S 451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S 452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S @@ -457,7 +457,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C 457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S 458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S -459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S +459,1,2,"Toomey, Miss Ellen",female,50,0,0,F.C.C. 13531,10.5,,S 460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q 461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S 462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S @@ -468,42 +468,42 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S 468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S 469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q -470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C +470,1,3,"Baclini, Miss Helene Barbara",female,0.75,2,1,2666,19.2583,,C 471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S 472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S 473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S 474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C -475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S +475,0,3,"Strandberg, Miss Ida Sofia",female,22,0,0,7553,9.8375,,S 476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S 477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S 478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S 479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S -480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S -481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S +480,1,3,"Hirvonen, Miss Hildur E",female,2,0,1,3101298,12.2875,,S +481,0,3,"Goodwin, Master Harold Victor",male,9,5,2,CA 2144,46.9,,S 482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S 483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S 484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S 485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C -486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S +486,0,3,"Lefebre, Miss Jeannie",female,,3,1,4133,25.4667,,S 487,1,1,"Hoyt, Mrs. 
Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S 488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C 489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S -490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S +490,1,3,"Coutts, Master Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S 491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S 492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S 493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S 494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C 495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S 496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C -497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C +497,1,1,"Eustis, Miss Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C 498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S 499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S 500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S 501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S -502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q -503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q -504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S -505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S +502,0,3,"Canavan, Miss Mary",female,21,0,0,364846,7.75,,Q +503,0,3,"O'Sullivan, Miss Bridget Mary",female,,0,0,330909,7.6292,,Q +504,0,3,"Laitinen, Miss Kristina Sofia",female,37,0,0,4135,9.5875,,S +505,1,1,"Maioni, Miss Roberta",female,16,0,0,110152,86.5,B79,S 506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C 507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S 508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S @@ -519,41 +519,41 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q 519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S 520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S -521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S +521,1,1,"Perreault, Miss Anne",female,30,0,0,12749,93.5,B73,S 522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S 523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C 524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C 525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C 526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q -527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S +527,1,2,"Ridsdale, Miss Lucy",female,50,0,0,W./C. 14258,10.5,,S 528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S 529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S 530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S -531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S +531,1,2,"Quick, Miss Phyllis May",female,2,1,1,26360,26,,S 532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C 533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C 534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C -535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S -536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 
13529,26.25,,S +535,0,3,"Cacic, Miss Marija",female,30,0,0,315084,8.6625,,S +536,1,2,"Hart, Miss Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S 537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S -538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C +538,1,1,"LeRoy, Miss Bertha",female,30,0,0,PC 17761,106.425,,C 539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S -540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C -541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S -542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S -543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S +540,1,1,"Frolicher, Miss Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C +541,1,1,"Crosby, Miss Harriet R",female,36,0,2,WE/P 5735,71,B22,S +542,0,3,"Andersson, Miss Ingeborg Constanzia",female,9,4,2,347082,31.275,,S +543,0,3,"Andersson, Miss Sigrid Elisabeth",female,11,4,2,347082,31.275,,S 544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S 545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C 546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S 547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S 548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C 549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S -550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S +550,1,2,"Davies, Master John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S 551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C 552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S 553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q 554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C -555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S +555,1,3,"Ohman, Miss Velin",female,22,0,0,347085,7.775,,S 556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S 557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C 558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C @@ -563,7 +563,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S 563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S 564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S -565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +565,0,3,"Meanwell, Miss (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S 566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S 567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S 568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S @@ -572,19 +572,19 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S 572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S 573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S -574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +574,1,3,"Kelly, Miss Mary",female,,0,0,14312,7.75,,Q 575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S 576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S -577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +577,1,2,"Garside, Miss Ethel",female,34,0,0,243880,13,,S 578,1,1,"Silvey, Mrs. 
William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S 579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C 580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S -581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +581,1,2,"Christy, Miss Julie Rachel",female,25,1,1,237789,30,,S 582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C 583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S 584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C 585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C -586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S +586,1,1,"Taussig, Miss Ruth",female,18,0,2,110413,79.65,E68,S 587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S 588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C 589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S @@ -592,10 +592,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S 592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C 593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S -594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +594,0,3,"Bourke, Miss Mary",female,,0,2,364848,7.75,,Q 595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S 596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S -597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S +597,1,2,"Leitch, Miss Jessie Wills",female,,0,0,248727,33,,S 598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S 599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C 600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C @@ -608,16 +608,16 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S 608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S 609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C -610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +610,1,1,"Shutes, Miss Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S 611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S 612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S -613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +613,1,3,"Murphy, Miss Margaret Jane",female,,1,0,367230,15.5,,Q 614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q 615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S -616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +616,1,2,"Herman, Miss Alice",female,24,1,2,220845,65,,S 617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S 618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S -619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +619,1,2,"Becker, Miss Marion Louise",female,4,2,1,230136,39,F4,S 620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S 621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C 622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S @@ -626,34 +626,34 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S 626,0,1,"Sutton, Mr. 
Frederick",male,61,0,0,36963,32.3208,D50,S 627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q -628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +628,1,1,"Longley, Miss Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S 629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S 630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q 631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S 632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S 633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C 634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S -635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S -636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +635,0,3,"Skoog, Miss Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss Mary",female,28,0,0,237668,13,,S 637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S 638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S 639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S 640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S 641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S 642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C -643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +643,0,3,"Skoog, Miss Margit Elizabeth",female,2,3,2,347088,27.9,,S 644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S -645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +645,1,3,"Baclini, Miss Eugenie",female,0.75,2,1,2666,19.2583,,C 646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C 647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S 648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C 649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S -650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S +650,1,3,"Stanley, Miss Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S 651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S -652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +652,1,2,"Doling, Miss Elsie",female,18,0,1,231919,23,,S 653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S -654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q -655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +654,1,3,"O'Leary, Miss Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss Hanora ""Nora""",female,18,0,0,365226,6.75,,Q 656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S 657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S 658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q @@ -676,10 +676,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S 676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S 677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S -678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +678,1,3,"Turja, Miss Anna Sofia",female,18,0,0,4138,9.8417,,S 679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S 680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C -681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +681,0,3,"Peters, Miss Katie",female,,0,0,330935,8.1375,,Q 682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C 683,0,3,"Olsvigen, Mr. 
Thor Anderson",male,20,0,0,6563,9.225,,S 684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S @@ -688,48 +688,48 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S 688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S 689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S -690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +690,1,1,"Madill, Miss Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S 691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S -692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C +692,1,3,"Karun, Miss Manca",female,4,0,1,349256,13.4167,,C 693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S 694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C 695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S 696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S 697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S -698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +698,1,3,"Mullens, Miss Katherine ""Katie""",female,,0,0,35852,7.7333,,Q 699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C 700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S 701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C 702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S -703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +703,0,3,"Barbara, Miss Saiide",female,18,0,1,2691,14.4542,,C 704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q 705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S 706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S 707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S 708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S -709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S -710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +709,1,1,"Cleaver, Miss Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C 711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C 712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S 713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S 714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S 715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S 716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S -717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C -718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +717,1,1,"Endres, Miss Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S 719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q 720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S -721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +721,1,2,"Harper, Miss Annie Jessie ""Nina""",female,6,0,1,248727,33,,S 722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S 723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S 724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S 725,1,1,"Chambers, Mr. 
Norman Campbell",male,27,1,0,113806,53.1,E8,S 726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S 727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S -728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +728,1,3,"Mannion, Miss Margareth",female,,0,0,36866,7.7375,,Q 729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S -730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S -731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +730,0,3,"Ilmakangas, Miss Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S +731,1,1,"Allen, Miss Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S 732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C 733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S 734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S @@ -741,20 +741,20 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S 741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S 742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S -743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +743,1,1,"Ryerson, Miss Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C 744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S 745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S 746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S 747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S -748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S +748,1,2,"Sinkkonen, Miss Anna",female,30,0,0,250648,13,,S 749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S 750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q -751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S -752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S +751,1,2,"Wells, Miss Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master Meier",male,6,0,1,392096,12.475,E121,S 753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S 754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S 755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S -756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S +756,1,2,"Hamalainen, Master Viljo",male,0.67,1,1,250649,14.5,,S 757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S 758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S 759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S @@ -766,7 +766,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S 766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S 767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C -768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +768,0,3,"Mangan, Miss Mary",female,30.5,0,0,364850,7.75,,Q 769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q 770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S 771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S @@ -776,22 +776,22 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S 776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S 777,0,3,"Tobin, Mr. 
Roger",male,,0,0,383121,7.75,F38,Q -778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S +778,1,3,"Emanuel, Miss Virginia Ethel",female,5,0,0,364516,12.475,,S 779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q 780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S -781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C +781,1,3,"Ayoub, Miss Banoura",female,13,0,0,2687,7.2292,,C 782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S 783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S 784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S 785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S 786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S -787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S -788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q -789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +787,1,3,"Sjoblom, Miss Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S 790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C 791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q 792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S -793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +793,0,3,"Sage, Miss Stella Anna",female,,8,2,CA. 2343,69.55,,S 794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C 795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S 796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S @@ -801,47 +801,47 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S 801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S 802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S -803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S -804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +803,1,1,"Carter, Master William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master Assad Alexander",male,0.42,0,1,2625,8.5167,,C 805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S 806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S 807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S -808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +808,0,3,"Pettersson, Miss Ellen Natalia",female,18,0,0,347087,7.775,,S 809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S 810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S 811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S 812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S 813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S -814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +814,0,3,"Andersson, Miss Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S 815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S 816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S -817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +817,0,3,"Heininen, Miss Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S 818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C 819,0,3,"Holm, Mr. 
John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S -820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +820,0,3,"Skoog, Master Karl Thorsten",male,10,3,2,347088,27.9,,S 821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S 822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S 823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S 824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S -825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +825,0,3,"Panula, Master Urho Abraham",male,2,4,1,3101295,39.6875,,S 826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q 827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S -828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +828,1,2,"Mallet, Master Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C 829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q 830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, 831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C -832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +832,1,2,"Richards, Master George Sibley",male,0.83,1,1,29106,18.75,,S 833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C 834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S 835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S -836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +836,1,1,"Compton, Miss Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C 837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S 838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S 839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S 840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C 841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S 842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S -843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C +843,1,1,"Serepeca, Miss Augusta",female,30,0,0,113798,31,,C 844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C 845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S 846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S @@ -849,10 +849,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C 849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S 850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C -851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +851,0,3,"Andersson, Master Sigvard Harald Elias",male,4,4,2,347082,31.275,,S 852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S -853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C -854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +853,0,3,"Boulos, Miss Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss Mary Conover",female,16,0,1,PC 17592,39.4,D28,S 855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S 856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S 857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S @@ -862,31 +862,31 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S 862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S 863,1,1,"Swift, Mrs. 
Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S -864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +864,0,3,"Sage, Miss Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S 865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S 866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S -867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +867,1,2,"Duran y More, Miss Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C 868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S 869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S -870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S +870,1,3,"Johnson, Master Harold Theodor",male,4,1,1,347742,11.1333,,S 871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S 872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S 873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S 874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S 875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C -876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +876,1,3,"Najib, Miss Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C 877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S 878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S 879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S 880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C 881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S 882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S -883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +883,0,3,"Dahlberg, Miss Gerda Ulrika",female,22,0,0,7552,10.5167,,S 884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S 885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S 886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q 887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S -888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S -889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +888,1,1,"Graham, Miss Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S 890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C 891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q diff --git a/doc/source/_static/schemas/01_table_spreadsheet.png b/doc/source/_static/schemas/01_table_spreadsheet.png index b3cf5a0245b9c..4e3497879de31 100644 Binary files a/doc/source/_static/schemas/01_table_spreadsheet.png and b/doc/source/_static/schemas/01_table_spreadsheet.png differ diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 325c902dd4f9e..0691414f53306 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -130,7 +130,7 @@ Consult the docs for setting up pyenv `here `__. 
pyenv virtualenv

   # For instance:
-  pyenv virtualenv 3.9.10 pandas-dev
+  pyenv virtualenv 3.10 pandas-dev

   # Activate the virtualenv
   pyenv activate pandas-dev
diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst
index f958e4c4ad1fc..a3665c5bb4d1f 100644
--- a/doc/source/development/policies.rst
+++ b/doc/source/development/policies.rst
@@ -51,7 +51,7 @@ pandas may change the behavior of experimental features at any time.
 Python support
 ~~~~~~~~~~~~~~

-pandas mirrors the `NumPy guidelines for Python support `__.
+pandas mirrors the `SPEC 0 guideline for Python support `__.

 Security policy
 ~~~~~~~~~~~~~~~
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
index daa528c7d408a..dc0590f18751a 100644
--- a/doc/source/getting_started/comparison/comparison_with_sql.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -505,7 +505,7 @@ DELETE

     DELETE FROM tips WHERE tip > 9;

-In pandas we select the rows that should remain instead of deleting them:
+In pandas we select the rows that should remain rather than specifying the rows to delete:

 .. ipython:: python

diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst
index d9cb1de14aded..9f29f7f4f4406 100644
--- a/doc/source/getting_started/index.rst
+++ b/doc/source/getting_started/index.rst
@@ -134,8 +134,8 @@ to explore, clean, and process your data. In pandas, a data table is called a :c
-pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these
-data sources is provided by function with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data.
+pandas supports integration with many file formats and data sources out of the box (csv, excel, sql, json, parquet,…). The ability to import data from each of these
+data sources is provided by functions with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data.

 .. image:: ../_static/schemas/02_io_readwrite.svg
    :align: center

@@ -181,7 +181,7 @@ data sources is provided by function with the prefix ``read_*``. Similarly, the 
-Selecting or filtering specific rows and/or columns? Filtering the data on a condition? Methods for slicing, selecting, and extracting the +Selecting or filtering specific rows and/or columns? Filtering the data on a particular condition? Methods for slicing, selecting, and extracting the data you need are available in pandas. .. image:: ../_static/schemas/03_subset_columns_rows.svg @@ -228,7 +228,7 @@ data you need are available in pandas.
-pandas provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) +pandas provides plotting for your data right out of the box with the power of Matplotlib. Simply pick the plot type (scatter, bar, boxplot,...) corresponding to your data. .. image:: ../_static/schemas/04_plot_overview.svg @@ -275,7 +275,7 @@ corresponding to your data.
-There is no need to loop over all rows of your data table to do calculations. Data manipulations on a column work elementwise. +There's no need to loop over all rows of your data table to do calculations. Column data manipulations work elementwise in pandas. Adding a column to a :class:`DataFrame` based on existing data in other columns is straightforward. .. image:: ../_static/schemas/05_newcolumn_2.svg @@ -322,7 +322,7 @@ Adding a column to a :class:`DataFrame` based on existing data in other columns
-Basic statistics (mean, median, min, max, counts...) are easily calculable. These or custom aggregations can be applied on the entire
+Basic statistics (mean, median, min, max, counts...) are easily calculated across data frames. These, or even custom aggregations, can be applied to the entire
 data set, a sliding window of the data, or grouped by categories. The latter is also known as the split-apply-combine approach.

 .. image:: ../_static/schemas/06_groupby.svg
    :align: center

@@ -369,8 +369,8 @@ data set, a sliding window of the data, or grouped by categories. The latter is 
-Change the structure of your data table in multiple ways. You can :func:`~pandas.melt` your data table from wide to long/tidy form or :func:`~pandas.pivot`
-from long to wide format. With aggregations built-in, a pivot table is created with a single command.
+Change the structure of your data table in a variety of ways. You can use :func:`~pandas.melt` to reshape your data from a wide format to a long and tidy one. Use :func:`~pandas.pivot`
+to go from long to wide format. With aggregations built-in, a pivot table can be created with a single command.

 .. image:: ../_static/schemas/07_melt.svg
    :align: center
@@ -416,7 +416,7 @@ from long to wide format. With aggregations built-in, a pivot table is created w
-Multiple tables can be concatenated both column wise and row wise as database-like join/merge operations are provided to combine multiple tables of data.
+Multiple tables can be concatenated column wise or row wise, and pandas provides database-like join and merge operations to combine multiple tables of data.

 .. image:: ../_static/schemas/08_concat_row.svg
    :align: center
@@ -505,7 +505,7 @@ pandas has great support for time series and has an extensive set of tools for w
-Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it.
+Data sets often contain more than just numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it.

 .. raw:: html

@@ -551,9 +551,9 @@ the pandas-equivalent operations compared to software you already know:
     :class-card: comparison-card
     :shadow: md

-    The `R programming language `__ provides the
-    ``data.frame`` data structure and multiple packages, such as
-    `tidyverse `__ use and extend ``data.frame``
+    The `R programming language `__ provides a
+    ``data.frame`` data structure as well as packages like
+    `tidyverse `__ which use and extend ``data.frame``
     for convenient data handling functionalities similar to pandas.

     +++

@@ -572,8 +572,8 @@ the pandas-equivalent operations compared to software you already know:
     :class-card: comparison-card
     :shadow: md

-    Already familiar to ``SELECT``, ``GROUP BY``, ``JOIN``, etc.?
-    Most of these SQL manipulations do have equivalents in pandas.
+    Already familiar with ``SELECT``, ``GROUP BY``, ``JOIN``, etc.?
+    Many SQL manipulations have equivalents in pandas.

     +++

@@ -631,10 +631,10 @@ the pandas-equivalent operations compared to software you already know:
     :class-card: comparison-card
     :shadow: md

-    The `SAS `__ statistical software suite
-    also provides the ``data set`` corresponding to the pandas ``DataFrame``.
-    Also SAS vectorized operations, filtering, string processing operations,
-    and more have similar functions in pandas.
+    `SAS `__, the statistical software suite,
+    uses the ``data set`` structure, which closely corresponds to pandas' ``DataFrame``.
+    Also, SAS vectorized operations such as filtering and string processing
+    have similar functions in pandas.

     +++

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 01a79fc8e36fd..86ce05fde547b 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -161,7 +161,7 @@ Python terminal.

     >>> import pandas as pd
     >>> pd.test()
-    running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.9/site-packages/pandas
+    running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.10/site-packages/pandas

     ============================= test session starts ==============================
     platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0
diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
index ff89589baefb1..efcdb22778ef4 100644
--- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
+++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
@@ -46,7 +46,7 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno
         "Name": [
             "Braund, Mr. Owen Harris",
             "Allen, Mr. William Henry",
-            "Bonnell, Miss. Elizabeth",
+            "Bonnell, Miss Elizabeth",
         ],
         "Age": [22, 35, 58],
         "Sex": ["male", "male", "female"],
diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst
index 49cf7d32e0ef5..e9f83c602d086 100644
--- a/doc/source/getting_started/intro_tutorials/04_plotting.rst
+++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst
@@ -32,8 +32,10 @@ How do I create plots in pandas?

    air_quality.head()

 .. 
note:: - The usage of the ``index_col`` and ``parse_dates`` parameters of the ``read_csv`` function to define the first (0th) column as - index of the resulting ``DataFrame`` and convert the dates in the column to :class:`Timestamp` objects, respectively. + The ``index_col=0`` and ``parse_dates=True`` parameters passed to the ``read_csv`` function define + the first (0th) column as index of the resulting ``DataFrame`` and convert the dates in the column + to :class:`Timestamp` objects, respectively. + .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index 14db38c3822dc..6ba3c17fac3c3 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in .. ipython:: python - monthly_max = no_2.resample("ME").max() + monthly_max = no_2.resample("MS").max() monthly_max A very powerful method on time series data with a datetime index, is the diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index 05a7d63b7ff47..a8b7a387d80ec 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -6,11 +6,11 @@ Package overview **************** -pandas is a `Python `__ package providing fast, +pandas is a `Python `__ package that provides fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the -fundamental high-level building block for doing practical, **real-world** data -analysis in Python. Additionally, it has the broader goal of becoming **the +fundamental high-level building block for Python's practical, **real-world** data +analysis. Additionally, it seeks to become **the most powerful and flexible open source data analysis/manipulation tool available in any language**. It is already well on its way toward this goal. diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 3cdcb81c14961..887ffd5580a52 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -101,7 +101,7 @@ truncated for brevity. Viewing data ------------ -See the :ref:`Essentially basics functionality section `. +See the :ref:`Essential basic functionality section `. Use :meth:`DataFrame.head` and :meth:`DataFrame.tail` to view the top and bottom rows of the frame respectively: diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 0ff40dcdcd150..ffd7a2ad7bb01 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -155,16 +155,6 @@ speedups. ``numexpr`` uses smart chunking, caching, and multiple cores. ``bottle a set of specialized cython routines that are especially fast when dealing with arrays that have ``nans``. -Here is a sample (using 100 column x 100,000 row ``DataFrames``): - -.. csv-table:: - :header: "Operation", "0.11.0 (ms)", "Prior Version (ms)", "Ratio to Prior" - :widths: 25, 25, 25, 25 - - ``df1 > df2``, 13.32, 125.35, 0.1063 - ``df1 * df2``, 21.71, 36.63, 0.5928 - ``df1 + df2``, 22.04, 36.50, 0.6039 - You are highly encouraged to install both libraries. See the section :ref:`Recommended Dependencies ` for more installation info. 
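The hunk above removes the old 0.11.0 benchmark table. As a quick illustration of what it measured, here is a minimal sketch (shapes, seed, and variable names are illustrative assumptions, not part of the patch) of the elementwise operations that ``numexpr`` and ``bottleneck`` accelerate when installed:

.. code-block:: python

   # Illustrative sketch of the removed benchmark's workload
   # (100 columns x 100,000 rows); timings will vary by machine.
   # With numexpr installed, pandas can evaluate large elementwise
   # expressions like these through it automatically.
   import numpy as np
   import pandas as pd

   rng = np.random.default_rng(0)  # seeded for reproducibility
   df1 = pd.DataFrame(rng.standard_normal((100_000, 100)))
   df2 = pd.DataFrame(rng.standard_normal((100_000, 100)))

   mask = df1 > df2    # elementwise comparison (the ``df1 > df2`` row)
   prod = df1 * df2    # elementwise multiplication (the ``df1 * df2`` row)
   total = df1 + df2   # elementwise addition (the ``df1 + df2`` row)
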
@@ -1606,7 +1596,7 @@ For instance:

 This method does not convert the row to a Series object; it merely
 returns the values inside a namedtuple. Therefore,
 :meth:`~DataFrame.itertuples` preserves the data type of the values
-and is generally faster as :meth:`~DataFrame.iterrows`.
+and is generally faster than :meth:`~DataFrame.iterrows`.

 .. note::
diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst
index 3c361d4de17e5..7de0430123fd2 100644
--- a/doc/source/user_guide/boolean.rst
+++ b/doc/source/user_guide/boolean.rst
@@ -37,6 +37,19 @@ If you would prefer to keep the ``NA`` values you can manually fill them with ``

     s[mask.fillna(True)]

+If you create a column of ``NA`` values (for example to fill them later)
+with ``df['new_col'] = pd.NA``, the ``dtype`` would be set to ``object`` in the
+new column. The performance on this column will be worse than with
+the appropriate type. It's better to use
+``df['new_col'] = pd.Series(pd.NA, dtype="boolean")``
+(or another ``dtype`` that supports ``NA``).
+
+.. ipython:: python
+
+   df = pd.DataFrame()
+   df['objects'] = pd.NA
+   df.dtypes
+
 .. _boolean.kleene:

 Kleene logical operations
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
index 7b2fd32303845..1e7d66dfeb142 100644
--- a/doc/source/user_guide/categorical.rst
+++ b/doc/source/user_guide/categorical.rst
@@ -793,7 +793,7 @@ Assigning a ``Categorical`` to parts of a column of other types will use the val
    :okwarning:

    df = pd.DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]})
-   df.loc[1:2, "a"] = pd.Categorical(["b", "b"], categories=["a", "b"])
+   df.loc[1:2, "a"] = pd.Categorical([2, 2], categories=[2, 3])
    df.loc[2:3, "b"] = pd.Categorical(["b", "b"], categories=["a", "b"])
    df
    df.dtypes
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index 267499edfae6f..8c80fa7052dd5 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -668,8 +668,9 @@ column, which produces an aggregated result with a hierarchical column index:

     grouped[["C", "D"]].agg(["sum", "mean", "std"])

-The resulting aggregations are named after the functions themselves. If you
-need to rename, then you can add in a chained operation for a ``Series`` like this:
+The resulting aggregations are named after the functions themselves.
+
+For a ``Series``, if you need to rename, you can add in a chained operation like this:

 .. ipython:: python

@@ -679,8 +680,19 @@ need to rename, then you can add in a chained operation for a ``Series`` like th
         .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"})
     )

+Or, you can simply pass a list of tuples, each with the name of the new column and the aggregate function:
+
+.. ipython:: python
+
+    (
+        grouped["C"]
+        .agg([("foo", "sum"), ("bar", "mean"), ("baz", "std")])
+    )
+
 For a grouped ``DataFrame``, you can rename in a similar manner:

+By chaining the ``rename`` operation:
+
 .. ipython:: python

     (
@@ -689,6 +701,16 @@ For a grouped ``DataFrame``, you can rename in a similar manner:
         )
     )

+Or, by passing a list of tuples:
+
+.. ipython:: python
+
+    (
+        grouped[["C", "D"]].agg(
+            [("foo", "sum"), ("bar", "mean"), ("baz", "std")]
+        )
+    )
+
 .. 
note:: In general, the output column names should be unique, but pandas will allow diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 1a727cd78af09..76a2f22b7987d 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -84,6 +84,19 @@ with the dtype. In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +If you create a column of ``NA`` values (for example to fill them later) +with ``df['new_col'] = pd.NA``, the ``dtype`` would be set to ``object`` in the +new column. The performance on this column will be worse than with +the appropriate type. It's better to use +``df['new_col'] = pd.Series(pd.NA, dtype="Int64")`` +(or another ``dtype`` that supports ``NA``). + +.. ipython:: python + + df = pd.DataFrame() + df['objects'] = pd.NA + df.dtypes + Operations ---------- diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index dc06dd9620c24..be40710a9e307 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2161,7 +2161,7 @@ a JSON string with two fields, ``schema`` and ``data``. { "A": [1, 2, 3], "B": ["a", "b", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=3), + "C": pd.date_range("2016-01-01", freq="D", periods=3), }, index=pd.Index(range(3), name="idx"), ) @@ -2270,7 +2270,7 @@ round-trippable manner. { "foo": [1, 2, 3, 4], "bar": ["a", "b", "c", "d"], - "baz": pd.date_range("2018-01-01", freq="d", periods=4), + "baz": pd.date_range("2018-01-01", freq="D", periods=4), "qux": pd.Categorical(["a", "b", "c", "c"]), }, index=pd.Index(range(4), name="idx"), @@ -3003,7 +3003,7 @@ However, if XPath does not reference node names such as default, ``/*``, then .. note:: Since ``xpath`` identifies the parent of content to be parsed, only immediate - desendants which include child nodes or current attributes are parsed. + descendants which include child nodes or current attributes are parsed. Therefore, ``read_xml`` will not parse the text of grandchildren or other descendants and will not parse attributes of any descendant. To retrieve lower level content, adjust xpath to lower level. For example, @@ -3535,7 +3535,7 @@ For example, to read in a ``MultiIndex`` index without names: df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df -If the index has level names, they will parsed as well, using the same +If the index has level names, they will be parsed as well, using the same parameters. .. ipython:: python @@ -4990,7 +4990,7 @@ Caveats convenience you can use ``store.flush(fsync=True)`` to do this for you. * Once a ``table`` is created columns (DataFrame) are fixed; only exactly the same columns can be appended -* Be aware that timezones (e.g., ``pytz.timezone('US/Eastern')``) +* Be aware that timezones (e.g., ``zoneinfo.ZoneInfo('US/Eastern')``) are not necessarily equal across timezone versions. So if data is localized to a specific timezone in the HDFStore using one version of a timezone library and that data is updated with another version, the data @@ -5169,6 +5169,8 @@ See the `Full Documentation `__. .. ipython:: python + import pytz + df = pd.DataFrame( { "a": list("abc"), @@ -5178,7 +5180,7 @@ See the `Full Documentation `__. 
"e": [True, False, True], "f": pd.Categorical(list("abc")), "g": pd.date_range("20130101", periods=3), - "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.date_range("20130101", periods=3, tz=pytz.timezone("US/Eastern")), "i": pd.date_range("20130101", periods=3, freq="ns"), } ) @@ -5847,10 +5849,10 @@ You can check if a table exists using :func:`~pandas.io.sql.has_table` Schema support '''''''''''''' -Reading from and writing to different schema's is supported through the ``schema`` +Reading from and writing to different schemas is supported through the ``schema`` keyword in the :func:`~pandas.read_sql_table` and :func:`~pandas.DataFrame.to_sql` functions. Note however that this depends on the database flavor (sqlite does not -have schema's). For example: +have schemas). For example: .. code-block:: python diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index f3b849dc6de45..cfd2f40aa93a3 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -974,7 +974,7 @@ with optional filling of missing data with ``fill_method``. :func:`merge_asof` --------------------- -:func:`merge_asof` is similar to an ordered left-join except that mactches are on the +:func:`merge_asof` is similar to an ordered left-join except that matches are on the nearest key rather than equal keys. For each row in the ``left`` :class:`DataFrame`, the last row in the ``right`` :class:`DataFrame` are selected where the ``on`` key is less than the left's key. Both :class:`DataFrame` must be sorted by the key. diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 5149bd30dbbef..e15939eb49239 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -32,7 +32,7 @@ use :class:`api.typing.NaTType`. :class:`NA` for :class:`StringDtype`, :class:`Int64Dtype` (and other bit widths), :class:`Float64Dtype` (and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`. These types will maintain the original data type of the data. -For typing applications, use :class:`api.types.NAType`. +For typing applications, use :class:`api.typing.NAType`. .. ipython:: python @@ -60,7 +60,7 @@ To detect these missing value, use the :func:`isna` or :func:`notna` methods. .. warning:: - Equality compaisons between ``np.nan``, :class:`NaT`, and :class:`NA` + Equality comparisons between ``np.nan``, :class:`NaT`, and :class:`NA` do not act like ``None`` .. ipython:: python @@ -319,7 +319,7 @@ Missing values propagate through arithmetic operations between pandas objects. The descriptive statistics and computational methods discussed in the :ref:`data structure overview ` (and listed :ref:`here -` and :ref:`here `) are all +` and :ref:`here `) all account for missing data. When summing data, NA values or empty data will be treated as zero. @@ -337,10 +337,8 @@ When taking the product, NA values or empty data will be treated as 1. pd.Series([], dtype="float64").prod() Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` -ignore NA values by default preserve them in the result. This behavior can be changed -with ``skipna`` - -* Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use ``skipna=False``. +ignore NA values by default, but preserve them in the resulting array. 
To override +this behaviour and include NA values in the calculation, use ``skipna=False``. .. ipython:: python @@ -355,7 +353,7 @@ with ``skipna`` Dropping missing data ~~~~~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.dropna` dropa rows or columns with missing data. +:meth:`~DataFrame.dropna` drops rows or columns with missing data. .. ipython:: python diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index ce805f98ca528..7757d95c2bccd 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -8,7 +8,7 @@ Options and settings Overview -------- -pandas has an options API configure and customize global behavior related to +pandas has an options API to configure and customize global behavior related to :class:`DataFrame` display, data behavior and more. Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``). diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst index 61b383afb7c43..aecbce0441b53 100644 --- a/doc/source/user_guide/pyarrow.rst +++ b/doc/source/user_guide/pyarrow.rst @@ -159,9 +159,11 @@ PyArrow also provides IO reading functionality that has been integrated into sev functions provide an ``engine`` keyword that can dispatch to PyArrow to accelerate reading from an IO source. * :func:`read_csv` +* :func:`read_feather` * :func:`read_json` * :func:`read_orc` -* :func:`read_feather` +* :func:`read_parquet` +* :func:`read_table` (experimental) .. ipython:: python diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 04ba3e5be8ff7..f4a55280cd1f1 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -351,7 +351,7 @@ "\n", "- Using [.set_table_styles()][table] to control broader areas of the table with specified internal CSS. Although table styles allow the flexibility to add CSS selectors and properties controlling all individual parts of the table, they are unwieldy for individual cell specifications. Also, note that table styles cannot be exported to Excel. \n", "- Using [.set_td_classes()][td_class] to directly link either external CSS classes to your data cells or link the internal CSS classes created by [.set_table_styles()][table]. See [here](#Setting-Classes-and-Linking-to-External-CSS). These cannot be used on column header rows or indexes, and also won't export to Excel. \n", - "- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes; [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n", + "- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes: [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. 
These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n",
    "\n",
    "[table]: ../reference/api/pandas.io.formats.style.Styler.set_table_styles.rst\n",
    "[styler]: ../reference/api/pandas.io.formats.style.Styler.rst\n",
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index ad2690ae395be..827e7a3c884d9 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -204,7 +204,7 @@ and replacing any remaining whitespaces with underscores:

 .. warning::

-    The type of the Series is inferred and the allowed types (i.e. strings).
+    The type of the Series is inferred and must be one of the allowed types (i.e. strings).

     Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few
     exceptions, other uses are not supported, and may be disabled at a later point.
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
index 5daf204f39bcf..01df17bac5fd7 100644
--- a/doc/source/user_guide/timedeltas.rst
+++ b/doc/source/user_guide/timedeltas.rst
@@ -35,7 +35,7 @@ You can construct a ``Timedelta`` scalar through various arguments, including `I
    pd.Timedelta(days=1, seconds=1)

    # integers with a unit
-   pd.Timedelta(1, unit="d")
+   pd.Timedelta(1, unit="D")

    # from a datetime.timedelta/np.timedelta64
    pd.Timedelta(datetime.timedelta(days=1, seconds=1))
@@ -94,7 +94,7 @@ is numeric:

 .. ipython:: python

    pd.to_timedelta(np.arange(5), unit="s")
-   pd.to_timedelta(np.arange(5), unit="d")
+   pd.to_timedelta(np.arange(5), unit="D")

 .. warning::
    If a string or array of strings is passed as an input then the ``unit`` keyword
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index b31249e1cf7c1..0845417e4910d 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -1273,6 +1273,10 @@ frequencies. We will refer to these aliases as *offset aliases*.
     are deprecated in favour of the aliases ``h``, ``bh``, ``cbh``, ``min``, ``s``, ``ms``, ``us``, and ``ns``.

+    Aliases ``Y``, ``M``, and ``Q`` are deprecated in favour of the aliases
+    ``YE``, ``ME``, and ``QE``.
+
+
 .. note::

     When using the offset aliases above, it should be noted that functions
@@ -1475,7 +1479,7 @@ or some other non-observed day. Defined observance rules are:

     "after_nearest_workday", "apply ``nearest_workday`` and then move to next workday after that day"
     "sunday_to_monday", "move Sunday to following Monday"
     "next_monday_or_tuesday", "move Saturday to Monday and Sunday/Monday to Tuesday"
-    "previous_friday", move Saturday and Sunday to previous Friday"
+    "previous_friday", "move Saturday and Sunday to previous Friday"
     "next_monday", "move Saturday and Sunday to following Monday"
     "weekend_to_monday", "same as ``next_monday``"

@@ -1860,7 +1864,7 @@ to resample based on datetimelike column in the frame, it can passed to the
         ),
     )
     df
-    df.resample("ME", on="date")[["a"]].sum()
+    df.resample("MS", on="date")[["a"]].sum()

 Similarly, if you instead want to resample by a datetimelike
 level of ``MultiIndex``, its name or location can be passed to the
@@ -1868,7 +1872,7 @@ level of ``MultiIndex``, its name or location can be passed to the

 .. ipython:: python

-    df.resample("ME", level="d")[["a"]].sum()
+    df.resample("MS", level="d")[["a"]].sum()

 .. _timeseries.iterating-label:
@@ -2333,7 +2337,7 @@ Time zone handling
 ------------------

 pandas provides rich support for working with timestamps in different time
-zones using the ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone`
+zones using the ``zoneinfo``, ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone`
 objects from the standard library.

@@ -2350,14 +2354,14 @@ By default, pandas objects are time zone unaware:
 To localize these dates to a time zone (assign a particular time zone to a naive date),
 you can use the ``tz_localize`` method or the ``tz`` keyword argument in
 :func:`date_range`, :class:`Timestamp`, or :class:`DatetimeIndex`.
-You can either pass ``pytz`` or ``dateutil`` time zone objects or Olson time zone database strings.
+You can either pass ``zoneinfo``, ``pytz`` or ``dateutil`` time zone objects or Olson time zone database strings.
 Olson time zone strings will return ``pytz`` time zone objects by default.
 To return ``dateutil`` time zone objects, append ``dateutil/`` before the string.

-* In ``pytz`` you can find a list of common (and less common) time zones using
-  ``from pytz import common_timezones, all_timezones``.
+* For ``zoneinfo``, a list of available timezones can be obtained from :py:func:`zoneinfo.available_timezones`.
+* In ``pytz`` you can find a list of common (and less common) time zones using ``pytz.all_timezones``.
 * ``dateutil`` uses the OS time zones so there isn't a fixed list available. For
-  common zones, the names are the same as ``pytz``.
+  common zones, the names are the same as ``pytz`` and ``zoneinfo``.

 .. ipython:: python

@@ -2462,7 +2466,7 @@ you can use the ``tz_convert`` method.

 .. warning::

-    If you are using dates beyond 2038-01-18, due to current deficiencies
+    If you are using dates beyond 2038-01-18 with ``pytz``, due to current deficiencies
     in the underlying libraries caused by the year 2038 problem, daylight saving
     time (DST) adjustments to timezone aware dates will not be applied. If and when
     the underlying libraries are fixed, the DST transitions will be applied.

@@ -2471,9 +2475,11 @@ you can use the ``tz_convert`` method.

 .. ipython:: python

+    import pytz
+
     d_2037 = "2037-03-31T010101"
     d_2038 = "2038-03-31T010101"
-    DST = "Europe/London"
+    DST = pytz.timezone("Europe/London")
     assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz="GMT")
     assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz="GMT")
diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst
index 3c5488a47bdf2..8e323d8aac5e3 100644
--- a/doc/source/whatsnew/v0.13.0.rst
+++ b/doc/source/whatsnew/v0.13.0.rst
@@ -523,13 +523,25 @@ Enhancements

   Using the new top-level ``to_timedelta``, you can convert a scalar or array from the standard
   timedelta format (produced by ``to_csv``) into a timedelta type (``np.timedelta64`` in ``nanoseconds``).

-  .. ipython:: python
+  .. 
code-block:: ipython + + In [53]: pd.to_timedelta('1 days 06:05:01.00003') + Out[53]: Timedelta('1 days 06:05:01.000030') + + In [54]: pd.to_timedelta('15.5us') + Out[54]: Timedelta('0 days 00:00:00.000015500') + + In [55]: pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + Out[55]: TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], dtype='timedelta64[ns]', freq=None) + + In [56]: pd.to_timedelta(np.arange(5), unit='s') + Out[56]: + TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', + '0 days 00:00:03', '0 days 00:00:04'], + dtype='timedelta64[ns]', freq=None) - pd.to_timedelta('1 days 06:05:01.00003') - pd.to_timedelta('15.5us') - pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) - pd.to_timedelta(np.arange(5), unit='s') - pd.to_timedelta(np.arange(5), unit='d') + In [57]: pd.to_timedelta(np.arange(5), unit='d') + Out[57]: TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) A Series of dtype ``timedelta64[ns]`` can now be divided by another ``timedelta64[ns]`` object, or astyped to yield a ``float64`` dtyped Series. This diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 569197fe9daf5..563035e0e2940 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -322,15 +322,28 @@ Tz-aware are rounded, floored and ceiled in local times Timedeltas -.. ipython:: python +.. code-block:: ipython + + In [37]: t = pd.timedelta_range('1 days 2 hr 13 min 45 us', periods=3, freq='d') - t = pd.timedelta_range('1 days 2 hr 13 min 45 us', periods=3, freq='d') - t - t.round('10min') + In [38]: t + Out[38]: + TimedeltaIndex(['1 days 02:13:00.000045', '2 days 02:13:00.000045', + '3 days 02:13:00.000045'], + dtype='timedelta64[ns]', freq='D') + + In [39]: t.round('10min') + Out[39]: + TimedeltaIndex(['1 days 02:10:00', '2 days 02:10:00', + '3 days 02:10:00'], + dtype='timedelta64[ns]', freq=None) # Timedelta scalar - t[0] - t[0].round('2h') + In [40]: t[0] + Out[40]: Timedelta('1 days 02:13:00.000045') + + In [41]: t[0].round('2h') + Out[41]: Timedelta('1 days 02:00:00') In addition, ``.round()``, ``.floor()`` and ``.ceil()`` will be available through the ``.dt`` accessor of ``Series``. diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index f63db945165e7..d6d1d96ccc878 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -308,15 +308,26 @@ The new orient ``'table'`` for :meth:`DataFrame.to_json` will generate a `Table Schema`_ compatible string representation of the data. -.. ipython:: python +.. 
code-block:: ipython - df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, - index=pd.Index(range(3), name='idx')) - df - df.to_json(orient='table') + In [38]: df = pd.DataFrame( + ....: {'A': [1, 2, 3], + ....: 'B': ['a', 'b', 'c'], + ....: 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, + ....: index=pd.Index(range(3), name='idx')) + In [39]: df + Out[39]: + A B C + idx + 0 1 a 2016-01-01 + 1 2 b 2016-01-02 + 2 3 c 2016-01-03 + + [3 rows x 3 columns] + + In [40]: df.to_json(orient='table') + Out[40]: + '{"schema":{"fields":[{"name":"idx","type":"integer"},{"name":"A","type":"integer"},{"name":"B","type":"string"},{"name":"C","type":"datetime"}],"primaryKey":["idx"],"pandas_version":"1.4.0"},"data":[{"idx":0,"A":1,"B":"a","C":"2016-01-01T00:00:00.000"},{"idx":1,"A":2,"B":"b","C":"2016-01-02T00:00:00.000"},{"idx":2,"A":3,"B":"c","C":"2016-01-03T00:00:00.000"}]}' See :ref:`IO: Table Schema for more information `. diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index a33a8f7addeef..8a9227ac37b67 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -157,16 +157,27 @@ sum and ``1`` for product. *pandas 0.22.0* -.. ipython:: python +.. code-block:: ipython + + In [11]: s = pd.Series([1, 1, np.nan, np.nan], + ....: index=pd.date_range("2017", periods=4)) - s = pd.Series([1, 1, np.nan, np.nan], index=pd.date_range("2017", periods=4)) - s.resample("2d").sum() + In [12]: s.resample("2d").sum() + Out[12]: + 2017-01-01 2.0 + 2017-01-03 0.0 + Freq: 2D, Length: 2, dtype: float64 To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. -.. ipython:: python +.. code-block:: ipython + + In [13]: s.resample("2d").sum(min_count=1) + Out[13]: + 2017-01-01 2.0 + 2017-01-03 NaN + Freq: 2D, Length: 2, dtype: float64 - s.resample("2d").sum(min_count=1) In particular, upsampling and taking the sum or product is affected, as upsampling introduces missing values even if the original series was diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 808741ccf4475..663b47a4d2d55 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -50,19 +50,55 @@ JSON read/write round-trippable with ``orient='table'`` A ``DataFrame`` can now be written to and subsequently read back via JSON while preserving metadata through usage of the ``orient='table'`` argument (see :issue:`18912` and :issue:`9146`). Previously, none of the available ``orient`` values guaranteed the preservation of dtypes and index names, amongst other metadata. -.. ipython:: python +.. 
code-block:: ipython - df = pd.DataFrame({'foo': [1, 2, 3, 4], - 'bar': ['a', 'b', 'c', 'd'], - 'baz': pd.date_range('2018-01-01', freq='d', periods=4), - 'qux': pd.Categorical(['a', 'b', 'c', 'c'])}, - index=pd.Index(range(4), name='idx')) - df - df.dtypes - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') - new_df - new_df.dtypes + In [1]: df = pd.DataFrame({'foo': [1, 2, 3, 4], + ...: 'bar': ['a', 'b', 'c', 'd'], + ...: 'baz': pd.date_range('2018-01-01', freq='d', periods=4), + ...: 'qux': pd.Categorical(['a', 'b', 'c', 'c'])}, + ...: index=pd.Index(range(4), name='idx')) + + In [2]: df + Out[2]: + foo bar baz qux + idx + 0 1 a 2018-01-01 a + 1 2 b 2018-01-02 b + 2 3 c 2018-01-03 c + 3 4 d 2018-01-04 c + + [4 rows x 4 columns] + + In [3]: df.dtypes + Out[3]: + foo int64 + bar object + baz datetime64[ns] + qux category + Length: 4, dtype: object + + In [4]: df.to_json('test.json', orient='table') + + In [5]: new_df = pd.read_json('test.json', orient='table') + + In [6]: new_df + Out[6]: + foo bar baz qux + idx + 0 1 a 2018-01-01 a + 1 2 b 2018-01-02 b + 2 3 c 2018-01-03 c + 3 4 d 2018-01-04 c + + [4 rows x 4 columns] + + In [7]: new_df.dtypes + Out[7]: + foo int64 + bar object + baz datetime64[ns] + qux category + Length: 4, dtype: object Please note that the string ``index`` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e8a0aaead705e..846d863910b4c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -32,20 +32,27 @@ Other enhancements - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) +- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). +- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. 
(:issue:`56981`) +- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) - Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`) - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) +- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) +- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) +- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) +- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) -- .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: @@ -124,8 +131,76 @@ notable_bug_fix2 Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_300.api_breaking.datetime_resolution_inference: + +Datetime resolution inference +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Converting a sequence of strings, ``datetime`` objects, or ``np.datetime64`` objects to +a ``datetime64`` dtype now performs inference on the appropriate resolution (AKA unit) for the output dtype. This affects :class:`Series`, :class:`DataFrame`, :class:`Index`, :class:`DatetimeIndex`, and :func:`to_datetime`. + +Previously, these would always give nanosecond resolution: + +.. 
code-block:: ipython + + In [1]: dt = pd.Timestamp("2024-03-22 11:36").to_pydatetime() + In [2]: pd.to_datetime([dt]).dtype + Out[2]: dtype('`_) - Enforced deprecation ``all`` and ``any`` reductions with ``datetime64``, :class:`DatetimeTZDtype`, and :class:`PeriodDtype` dtypes (:issue:`58029`) - Enforced deprecation disallowing ``float`` "periods" in :func:`date_range`, :func:`period_range`, :func:`timedelta_range`, :func:`interval_range`, (:issue:`56036`) - Enforced deprecation disallowing parsing datetimes with mixed time zones unless user passes ``utc=True`` to :func:`to_datetime` (:issue:`57275`) @@ -277,9 +390,13 @@ Removal of prior version deprecations/changes - Enforced deprecation of string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57699`) - Enforced deprecation of string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`57793`) - Enforced deprecation of string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`57793`) -- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) +- Enforced deprecation of strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`59143`) +- Enforced deprecation of strings ``H``, ``BH``, and ``CBH`` denoting units in :class:`Timedelta` (:issue:`59143`) +- Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`) - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`) +- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories. 
(:issue:`58270`) +- Enforced deprecation of the behavior of :meth:`Series.argsort` in the presence of NA values (:issue:`58232`) - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`) - Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Enforced silent-downcasting deprecation for :ref:`all relevant methods ` (:issue:`54710`) @@ -387,12 +504,18 @@ Categorical Datetimelike ^^^^^^^^^^^^ +- Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`) - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) +- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) +- Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) +- Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`) +- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) +- Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. 
(:issue:`58859`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta @@ -407,8 +530,8 @@ Timezones Numeric ^^^^^^^ +- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`) - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`) -- Conversion ^^^^^^^^^^ @@ -431,6 +554,7 @@ Interval Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) +- Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) - Missing @@ -442,26 +566,38 @@ MultiIndex ^^^^^^^^^^ - :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`) - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`) +- :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`) +- :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`) - I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) +- Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) +- Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) +- Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) +- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) +- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) +- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) +- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) +- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
+- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
+- Bug in :meth:`read_stata` where the missing code for double was not recognized for format versions 105 and prior (:issue:`58149`)

Period
^^^^^^
--
+- Fixed error message when passing invalid period alias to :meth:`PeriodIndex.to_timestamp` (:issue:`58974`)
-

Plotting
^^^^^^^^
-- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`)
+- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`)
- Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`)
--
+- Bug in :meth:`Series.plot` with ``kind="pie"`` and :class:`ArrowDtype` (:issue:`59192`)

Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
@@ -471,25 +607,29 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
+- Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`)
- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
-
+- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_periods`` periods if ``method="table"``. (:issue:`58868`)

Reshaping
^^^^^^^^^
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
+- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating an empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)

Sparse
^^^^^^
- Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`)
--
+- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard-coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`)
ExtensionArray
^^^^^^^^^^^^^^
- Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
+- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`)

Styler
^^^^^^
@@ -498,8 +638,11 @@ Other
^^^^^
- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
+- Bug in :func:`eval` where division ``/`` involving an :class:`ExtensionArray` failed with a ``TypeError``. (:issue:`58748`)
+- Bug in :func:`eval` where division ``/`` of :class:`complex` values discarded the imaginary part. (:issue:`21374`)
- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
+- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the use of the ``tan`` function. (:issue:`55091`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
@@ -507,6 +650,7 @@ Other
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
- Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
+- Bug in :meth:`Series.dt` methods with :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
- Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present.
(:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) diff --git a/environment.yml b/environment.yml index dcc7aa5280b2c..e5646af07c45c 100644 --- a/environment.yml +++ b/environment.yml @@ -15,8 +15,8 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-qt>=4.2.0 + - pytest-xdist>=3.4.0 + - pytest-qt>=4.4.0 - pytest-localserver - pyqt>=5.15.9 - coverage @@ -33,7 +33,7 @@ dependencies: - fastparquet>=2023.10.0 - fsspec>=2022.11.0 - html5lib>=1.1 - - hypothesis>=6.46.1 + - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - ipython - jinja2>=3.1.2 diff --git a/meson.build b/meson.build index 06623a305ab54..efe543b7a267c 100644 --- a/meson.build +++ b/meson.build @@ -44,6 +44,11 @@ else meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif +cy = meson.get_compiler('cython') +if cy.version().version_compare('>=3.1.0') + add_project_arguments('-Xfreethreading_compatible=true', language : 'cython') +endif + # Needed by pandas.test() when it looks for the pytest ini options py.install_sources( 'pyproject.toml', diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index e746933ac0bf7..80d9ea1b364f3 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -30,6 +30,6 @@ from pandas._config.display import detect_console_encoding -def using_pyarrow_string_dtype() -> bool: +def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 1b91a7c3ee636..51794ec04b29e 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -55,7 +55,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, NamedTuple, cast, ) @@ -66,6 +65,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Sequence, ) @@ -211,6 +211,14 @@ def set_option(*args) -> None: TypeError if keyword arguments are provided OptionError if no such option exists + See Also + -------- + get_option : Retrieve the value of the specified option. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + option_context : Context manager to temporarily set options in a ``with`` + statement. + Notes ----- For all available options, please view the :ref:`User Guide ` @@ -271,6 +279,12 @@ def describe_option(pat: str = "", _print_desc: bool = True) -> str | None: str If the description(s) as a string if ``_print_desc=False``. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. + Notes ----- For all available options, please view the @@ -315,6 +329,12 @@ def reset_option(pat: str) -> None: None No return value. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + describe_option : Print the description for one or more registered options. + Notes ----- For all available options, please view the @@ -406,6 +426,13 @@ def option_context(*args) -> Generator[None, None, None]: None No return value. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option. 
+ reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + Notes ----- For all available options, please view the :ref:`User Guide ` diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 15f8727c38f8d..d7e485f74e58b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -398,8 +398,14 @@ def group_cumsum( for i in range(N): lab = labels[i] - if lab < 0: + if uses_mask and lab < 0: + # GH#58811 + result_mask[i, :] = True + out[i, :] = 0 + continue + elif lab < 0: continue + for j in range(K): val = values[i, j] diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 811fdd139de2c..2fa61642968cf 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -156,7 +156,7 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) // NaN-floats should be in the same equivalency class, see GH 22119 static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { - return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) || + return (isnan(PyFloat_AS_DOUBLE(a)) && isnan(PyFloat_AS_DOUBLE(b))) || (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b)); } @@ -164,12 +164,12 @@ static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { // PyObject_RichCompareBool for complexobjects has a different behavior // needs to be replaced static inline int complexobject_cmp(PyComplexObject *a, PyComplexObject *b) { - return (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && - Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) || - (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + return (isnan(a->cval.real) && isnan(b->cval.real) && isnan(a->cval.imag) && + isnan(b->cval.imag)) || + (isnan(a->cval.real) && isnan(b->cval.real) && a->cval.imag == b->cval.imag) || - (a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag)) || + (a->cval.real == b->cval.real && isnan(a->cval.imag) && + isnan(b->cval.imag)) || (a->cval.real == b->cval.real && a->cval.imag == b->cval.imag); } @@ -207,7 +207,8 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { if (PyComplex_CheckExact(a)) { return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b); } - if (PyTuple_CheckExact(a)) { + if (PyTuple_Check(a)) { + // compare tuple subclasses as builtin tuples return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b); } // frozenset isn't yet supported @@ -223,7 +224,7 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { static inline Py_hash_t _Pandas_HashDouble(double val) { // Since Python3.10, nan is no longer has hash 0 - if (Py_IS_NAN(val)) { + if (isnan(val)) { return 0; } #if PY_VERSION_HEX < 0x030A0000 @@ -311,7 +312,8 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { // because complex(k,0) == k holds for any int-object k // and kh_complex128_hash_func doesn't respect it hash = complexobject_hash((PyComplexObject *)key); - } else if (PyTuple_CheckExact(key)) { + } else if (PyTuple_Check(key)) { + // hash tuple subclasses as builtin tuples hash = tupleobject_hash((PyTupleObject *)key); } else { hash = PyObject_Hash(key); diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 83373cb4b1d9f..0bb47541e5963 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -37,7 +37,7 @@ from cython cimport ( floating, ) -from pandas._config import 
using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.missing import check_na_tuples_nonequal @@ -96,16 +96,12 @@ from pandas._libs.missing cimport ( is_null_datetime64, is_null_timedelta64, ) -from pandas._libs.tslibs.conversion cimport ( - _TSObject, - convert_to_tsobject, -) +from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, checknull_with_nat, ) -from pandas._libs.tslibs.np_datetime cimport NPY_FR_ns from pandas._libs.tslibs.offsets cimport is_offset_object from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 @@ -2497,7 +2493,6 @@ def maybe_convert_objects(ndarray[object] objects, ndarray[uint8_t] mask Seen seen = Seen() object val - _TSObject tsobj float64_t fnan = NaN if dtype_if_all_nat is not None: @@ -2604,8 +2599,7 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.datetime_ = True try: - tsobj = convert_to_tsobject(val, None, None, 0, 0) - tsobj.ensure_reso(NPY_FR_ns) + convert_to_tsobject(val, None, None, 0, 0) except OutOfBoundsDatetime: # e.g. test_out_of_s_bounds_datetime64 seen.object_ = True @@ -2705,10 +2699,10 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): + if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index 4c1969f6d9f57..2c32fb0481486 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -245,7 +245,12 @@ static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) { } static PyModuleDef_Slot pandas_datetime_slots[] = { - {Py_mod_exec, pandas_datetime_exec}, {0, NULL}}; + {Py_mod_exec, pandas_datetime_exec}, +#if PY_VERSION_HEX >= 0x030D0000 + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, +#endif + {0, NULL}, +}; static struct PyModuleDef pandas_datetimemodule = { PyModuleDef_HEAD_INIT, diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c index 48f3cd14cbc30..51cdf071a15cf 100644 --- a/pandas/_libs/src/parser/pd_parser.c +++ b/pandas/_libs/src/parser/pd_parser.c @@ -161,7 +161,12 @@ static int pandas_parser_exec(PyObject *Py_UNUSED(module)) { } static PyModuleDef_Slot pandas_parser_slots[] = { - {Py_mod_exec, pandas_parser_exec}, {0, NULL}}; + {Py_mod_exec, pandas_parser_exec}, +#if PY_VERSION_HEX >= 0x030D0000 + {Py_mod_gil, Py_MOD_GIL_NOT_USED}, +#endif + {0, NULL}, +}; static struct PyModuleDef pandas_parsermodule = { PyModuleDef_HEAD_INIT, diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index fa91db5fe34e3..5f35860c59cb7 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -410,8 +410,8 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); 
+ npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, (int)npyarr->ndim); npyarr->stridedim = npyarr->ndim; npyarr->index[npyarr->ndim] = 0; npyarr->inc = -1; @@ -452,8 +452,8 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { return; } const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -524,8 +524,8 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { } const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dim = PyArray_DIM(arrayobj, (int)npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, (int)npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c index 075411a23b075..f369d122a3dbe 100644 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ b/pandas/_libs/src/vendored/ujson/python/ujson.c @@ -384,6 +384,10 @@ PyMODINIT_FUNC PyInit_json(void) { return NULL; } +#ifdef Py_GIL_DISABLED + PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED); +#endif + #ifndef PYPY_VERSION PyObject *mod_decimal = PyImport_ImportModule("decimal"); if (mod_decimal) { diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index dca3ba0ce49b3..928d253bf3169 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -63,7 +63,10 @@ from pandas._libs.tslibs.conversion cimport ( get_datetime64_nanos, parse_pydatetime, ) -from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev +from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + npy_unit_to_abbrev, +) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_nat_strings as nat_strings, @@ -260,7 +263,7 @@ cpdef array_to_datetime( bint dayfirst=False, bint yearfirst=False, bint utc=False, - NPY_DATETIMEUNIT creso=NPY_FR_ns, + NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC, str unit_for_numerics=None, ): """ @@ -288,8 +291,8 @@ cpdef array_to_datetime( yearfirst parsing behavior when encountering datetime strings utc : bool, default False indicator whether the dates should be UTC - creso : NPY_DATETIMEUNIT, default NPY_FR_ns - Set to NPY_FR_GENERIC to infer a resolution. + creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC + If NPY_FR_GENERIC, conduct inference. unit_for_numerics : str, default "ns" Returns @@ -389,7 +392,7 @@ cpdef array_to_datetime( # GH#32264 np.str_ object val = str(val) - if parse_today_now(val, &iresult[i], utc, creso): + if parse_today_now(val, &iresult[i], utc, creso, infer_reso=infer_reso): # We can't _quite_ dispatch this to convert_str_to_tsobject # bc there isn't a nice way to pass "utc" item_reso = NPY_DATETIMEUNIT.NPY_FR_us @@ -533,7 +536,9 @@ def array_to_datetime_with_tz( if state.creso_ever_changed: # We encountered mismatched resolutions, need to re-parse with # the correct one. 
- return array_to_datetime_with_tz(values, tz=tz, creso=creso) + return array_to_datetime_with_tz( + values, tz=tz, dayfirst=dayfirst, yearfirst=yearfirst, creso=creso + ) elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # i.e. we never encountered anything non-NaT, default to "s". This # ensures that insert and concat-like operations with NaT diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 3a55f5fa0c003..0fadbbbed2c72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -606,37 +606,42 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, # equiv: datetime.today().replace(tzinfo=tz) return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us) else: - string_to_dts_failed = string_to_dts( - ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed: - reso = get_supported_reso(out_bestunit) - check_dts_bounds(&dts, reso) - obj = _TSObject() - obj.dts = dts - obj.creso = reso - ival = npy_datetimestruct_to_datetime(reso, &dts) - - if out_local == 1: - obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) - obj.value = tz_localize_to_utc_single( - ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso - ) - if tz is None: - check_overflows(obj, reso) - return obj - _adjust_tsobject_tz_using_offset(obj, tz) - return obj - else: - if tz is not None: - # shift for _localize_tso - ival = tz_localize_to_utc_single( - ival, tz, ambiguous="raise", nonexistent=None, creso=reso + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + ts, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + reso = get_supported_reso(out_bestunit) + check_dts_bounds(&dts, reso) + obj = _TSObject() + obj.dts = dts + obj.creso = reso + ival = npy_datetimestruct_to_datetime(reso, &dts) + + if out_local == 1: + obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) + obj.value = tz_localize_to_utc_single( + ival, + obj.tzinfo, + ambiguous="raise", + nonexistent=None, + creso=reso, ) - obj.value = ival - maybe_localize_tso(obj, tz, obj.creso) - return obj + if tz is None: + check_overflows(obj, reso) + return obj + _adjust_tsobject_tz_using_offset(obj, tz) + return obj + else: + if tz is not None: + # shift for _localize_tso + ival = tz_localize_to_utc_single( + ival, tz, ambiguous="raise", nonexistent=None, creso=reso + ) + obj.value = ival + maybe_localize_tso(obj, tz, obj.creso) + return obj dt = parse_datetime_string( ts, diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 33f6789f3b402..d8c536a34bc04 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -12,12 +12,14 @@ cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cdef dict c_OFFSET_TO_PERIOD_FREQSTR -cdef dict c_OFFSET_DEPR_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR -cdef dict c_DEPR_ABBREVS +cdef dict c_PERIOD_TO_OFFSET_FREQSTR +cdef dict c_OFFSET_RENAMED_FREQSTR +cdef dict c_DEPR_UNITS +cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname cdef dict attrname_to_npy_unit +cdef str INVALID_FREQ_ERR_MSG cdef enum c_FreqGroup: # Mirrors FreqGroup in the .pyx file diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 5bfbe211bfd14..4100f3d90e817 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -1,9 +1,6 @@ # period frequency 
constants corresponding to scikits timeseries # originals from enum import Enum -import warnings - -from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS from pandas._libs.tslibs.np_datetime cimport ( @@ -176,6 +173,10 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "EOM": "M", "BME": "M", "SME": "M", + "BMS": "M", + "CBME": "M", + "CBMS": "M", + "SMS": "M", "BQS": "Q", "QS": "Q", "BQE": "Q", @@ -228,7 +229,6 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "YE-NOV": "Y-NOV", "W": "W", "ME": "M", - "Y": "Y", "BYE": "Y", "BYE-DEC": "Y-DEC", "BYE-JAN": "Y-JAN", @@ -245,7 +245,7 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "YS": "Y", "BYS": "Y", } -cdef dict c_OFFSET_DEPR_FREQSTR = { +cdef dict c_OFFSET_RENAMED_FREQSTR = { "M": "ME", "Q": "QE", "Q-DEC": "QE-DEC", @@ -303,19 +303,68 @@ cdef dict c_OFFSET_DEPR_FREQSTR = { "BQ-OCT": "BQE-OCT", "BQ-NOV": "BQE-NOV", } -cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = { - v: k for k, v in c_OFFSET_DEPR_FREQSTR.items() +PERIOD_TO_OFFSET_FREQSTR = { + "M": "ME", + "Q": "QE", + "Q-DEC": "QE-DEC", + "Q-JAN": "QE-JAN", + "Q-FEB": "QE-FEB", + "Q-MAR": "QE-MAR", + "Q-APR": "QE-APR", + "Q-MAY": "QE-MAY", + "Q-JUN": "QE-JUN", + "Q-JUL": "QE-JUL", + "Q-AUG": "QE-AUG", + "Q-SEP": "QE-SEP", + "Q-OCT": "QE-OCT", + "Q-NOV": "QE-NOV", + "Y": "YE", + "Y-DEC": "YE-DEC", + "Y-JAN": "YE-JAN", + "Y-FEB": "YE-FEB", + "Y-MAR": "YE-MAR", + "Y-APR": "YE-APR", + "Y-MAY": "YE-MAY", + "Y-JUN": "YE-JUN", + "Y-JUL": "YE-JUL", + "Y-AUG": "YE-AUG", + "Y-SEP": "YE-SEP", + "Y-OCT": "YE-OCT", + "Y-NOV": "YE-NOV", } +cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR +cdef dict c_PERIOD_TO_OFFSET_FREQSTR = PERIOD_TO_OFFSET_FREQSTR -# Map deprecated resolution abbreviations to correct resolution abbreviations -cdef dict c_DEPR_ABBREVS = { +cdef dict c_DEPR_UNITS = { + "w": "W", + "d": "D", "H": "h", - "BH": "bh", - "CBH": "cbh", + "MIN": "min", "S": "s", + "MS": "ms", + "US": "us", + "NS": "ns", +} + +cdef dict c_PERIOD_AND_OFFSET_DEPR_FREQSTR = { + "w": "W", + "w-mon": "W-MON", + "w-tue": "W-TUE", + "w-wed": "W-WED", + "w-thu": "W-THU", + "w-fri": "W-FRI", + "w-sat": "W-SAT", + "w-sun": "W-SUN", + "d": "D", + "b": "B", + "c": "C", + "MIN": "min", + "US": "us", + "NS": "ns", } +cdef str INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" + class FreqGroup(Enum): # Mirrors c_FreqGroup in the .pxd file @@ -405,43 +454,18 @@ class Resolution(Enum): >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR True """ - cdef: - str abbrev - if freq in {"T", "t", "L", "l", "U", "u", "N", "n"}: - raise ValueError( - f"Frequency \'{freq}\' is no longer supported." - ) try: - if freq in c_DEPR_ABBREVS: - abbrev = c_DEPR_ABBREVS[freq] - warnings.warn( - f"\'{freq}\' is deprecated and will be removed in a future " - f"version. Please use \'{abbrev}\' " - f"instead of \'{freq}\'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - freq = abbrev attr_name = _abbrev_to_attrnames[freq] - except KeyError: + except KeyError as exc: + msg = INVALID_FREQ_ERR_MSG.format(freq) # For quarterly and yearly resolutions, we need to chop off # a month string. split_freq = freq.split("-") if len(split_freq) != 2: - raise + raise ValueError(msg) from exc if split_freq[1] not in _month_names: # i.e. we want e.g. 
"Q-DEC", not "Q-INVALID" - raise - if split_freq[0] in c_DEPR_ABBREVS: - abbrev = c_DEPR_ABBREVS[split_freq[0]] - warnings.warn( - f"\'{split_freq[0]}\' is deprecated and will be removed in a " - f"future version. Please use \'{abbrev}\' " - f"instead of \'{split_freq[0]}\'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - split_freq[0] = abbrev + raise ValueError(msg) from exc attr_name = _abbrev_to_attrnames[split_freq[0]] return cls.from_attrname(attr_name) diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 399a5c2e96cd5..e523ac2e7b5c6 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -253,9 +253,10 @@ def get_start_end_field( # month of year. Other offsets use month, startingMonth as ending # month of year. - if freq_name.lstrip("B")[0:2] in ["MS", "QS", "YS"]: + if freq_name.lstrip("B")[0:2] in ["QS", "YS"]: end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw + else: end_month = month_kw start_month = (end_month % 12) + 1 diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index c483814a3ef74..130e41e5104a2 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -441,6 +441,13 @@ class NaTType(_NaT): Monday == 1 ... Sunday == 7. + See Also + -------- + Timestamp.weekday : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isocalendar : Return a tuple containing ISO year, week number + and weekday. + datetime.date.isoweekday : Equivalent method in datetime module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -633,6 +640,16 @@ class NaTType(_NaT): """ Return time object with same time but with tzinfo=None. + This method extracts the time part of the `Timestamp` object, excluding any + timezone information. It returns a `datetime.time` object which only represents + the time (hours, minutes, seconds, and microseconds). + + See Also + -------- + Timestamp.date : Return date object with same year, month and day. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -841,7 +858,7 @@ class NaTType(_NaT): Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -894,7 +911,7 @@ class NaTType(_NaT): ---------- ordinal : int Date corresponding to a proleptic Gregorian ordinal. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. Notes @@ -957,11 +974,21 @@ class NaTType(_NaT): """ Return new Timestamp object representing current time local to tz. + This method returns a new `Timestamp` object that represents the current time. + If a timezone is provided, the current time will be localized to that timezone. + Otherwise, it returns the current local time. + Parameters ---------- tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.today : Return the current time in the local timezone. 
+ Examples -------- >>> pd.Timestamp.now() # doctest: +SKIP @@ -1139,6 +1166,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.ceil : Round up a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.floor : Round down the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, flooring will take place relative to the @@ -1301,7 +1334,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -1355,7 +1388,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding local time. @@ -1455,13 +1488,13 @@ default 'raise' Replace timezone (not a conversion): - >>> import pytz - >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> import zoneinfo + >>> ts.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') Analogous for ``pd.NaT``: - >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> pd.NaT.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) NaT """, ) @@ -1490,7 +1523,7 @@ default 'raise' See Also -------- - Timestamp.asm8 : Return numpy datetime64 format in nanoseconds. + Timestamp.asm8 : Return numpy datetime64 format with same precision. Timestamp.to_pydatetime : Convert Timestamp object to a native Python datetime object. to_timedelta : Convert argument into timedelta object, diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 0d681e0c2aae6..fd1bb3fe3e173 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -56,9 +56,10 @@ from pandas._libs.tslibs.ccalendar cimport ( ) from pandas._libs.tslibs.conversion cimport localize_pydatetime from pandas._libs.tslibs.dtypes cimport ( - c_DEPR_ABBREVS, - c_OFFSET_DEPR_FREQSTR, - c_REVERSE_OFFSET_DEPR_FREQSTR, + c_OFFSET_RENAMED_FREQSTR, + c_OFFSET_TO_PERIOD_FREQSTR, + c_PERIOD_AND_OFFSET_DEPR_FREQSTR, + c_PERIOD_TO_OFFSET_FREQSTR, periods_per_day, ) from pandas._libs.tslibs.nattype cimport ( @@ -2579,7 +2580,7 @@ cdef class YearEnd(YearOffset): YearEnd goes to the next date which is the end of the year. - Parameters + Attributes ---------- n : int, default 1 The number of years represented. 
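[Editor's note: the offsets.pyx hunks below replace the old warn-and-rewrite alias handling with the new ``_warn_about_deprecated_aliases`` / ``_validate_to_offset_alias`` helpers. A minimal sketch of the user-facing behavior they implement, assuming the post-patch semantics described by the added code; illustrative only, not part of the patch:

    import pandas as pd
    from pandas.tseries.frequencies import to_offset

    # "LWOM"-prefixed strings now parse, via the new LastWeekOfMonth
    # entry in prefix_mapping (GH 59218)
    print(to_offset("LWOM-SAT"))   # expected: <LastWeekOfMonth: weekday=5>

    # Renamed offset aliases now raise instead of warning
    try:
        to_offset("M")             # offsets must spell this "ME" now
    except ValueError as err:
        print(err)  # "'M' is no longer supported for offsets. Please use 'ME' instead."

    # Deprecated lowercase units still parse but emit a FutureWarning
    to_offset("2d")                # warns, suggesting "D" instead
]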
@@ -4675,6 +4676,7 @@ prefix_mapping = { Hour, # 'h' Day, # 'D' WeekOfMonth, # 'WOM' + LastWeekOfMonth, # 'LWOM' FY5253, FY5253Quarter, ] @@ -4695,13 +4697,9 @@ _lite_rule_alias = { "BYS": "BYS-JAN", # BYearBegin(month=1), "Min": "min", - "min": "min", - "ms": "ms", - "us": "us", - "ns": "ns", } -_dont_uppercase = {"h", "bh", "cbh", "MS", "ms", "s"} +_dont_uppercase = {"min", "h", "bh", "cbh", "s", "ms", "us", "ns"} INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4711,6 +4709,61 @@ INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" _offset_map = {} +def _warn_about_deprecated_aliases(name: str, is_period: bool) -> str: + if name in _lite_rule_alias: + return name + if name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR: + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " + f" instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return c_PERIOD_AND_OFFSET_DEPR_FREQSTR[name] + + for _name in (name.lower(), name.upper()): + if name == _name: + continue + if _name in c_PERIOD_AND_OFFSET_DEPR_FREQSTR.values(): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{_name}\' " + f" instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return _name + + return name + + +def _validate_to_offset_alias(alias: str, is_period: bool) -> None: + if not is_period: + if alias.upper() in c_OFFSET_RENAMED_FREQSTR: + raise ValueError( + f"\'{alias}\' is no longer supported for offsets. Please " + f"use \'{c_OFFSET_RENAMED_FREQSTR.get(alias.upper())}\' " + f"instead." + ) + if (alias.upper() != alias and + alias.lower() not in {"s", "ms", "us", "ns"} and + alias.upper().split("-")[0].endswith(("S", "E"))): + raise ValueError(INVALID_FREQ_ERR_MSG.format(alias)) + if ( + is_period and + alias in c_OFFSET_TO_PERIOD_FREQSTR and + alias != c_OFFSET_TO_PERIOD_FREQSTR[alias] + ): + alias_msg = c_OFFSET_TO_PERIOD_FREQSTR.get(alias) + raise ValueError( + f"for Period, please use \'{alias_msg}\' " + f"instead of \'{alias}\'" + ) + + # TODO: better name? 
def _get_offset(name: str) -> BaseOffset: """ @@ -4720,35 +4773,6 @@ def _get_offset(name: str) -> BaseOffset: -------- _get_offset('EOM') --> BMonthEnd(1) """ - if ( - name not in _lite_rule_alias - and (name.upper() in _lite_rule_alias) - and name != "ms" - ): - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use \'{name.upper()}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - elif ( - name not in _lite_rule_alias - and (name.lower() in _lite_rule_alias) - and name != "MS" - ): - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use \'{name.lower()}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if name not in _dont_uppercase: - name = name.upper() - name = _lite_rule_alias.get(name, name) - name = _lite_rule_alias.get(name.lower(), name) - else: - name = _lite_rule_alias.get(name, name) - if name not in _offset_map: try: split = name.split("-") @@ -4850,77 +4874,30 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if not is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = c_OFFSET_DEPR_FREQSTR[name.upper()] - if (not is_period and - name != name.upper() and - name.lower() not in {"s", "ms", "us", "ns"} and - name.upper().split("-")[0].endswith(("S", "E"))): - warnings.warn( - f"\'{name}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{name.upper()}\' instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = name.upper() - if is_period and name.upper() in c_REVERSE_OFFSET_DEPR_FREQSTR: - if name.upper().startswith("Y"): - raise ValueError( - f"for Period, please use \'Y{name.upper()[2:]}\' " - f"instead of \'{name}\'" - ) - if (name.upper().startswith("B") or - name.upper().startswith("S") or - name.upper().startswith("C")): - raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) - else: - raise ValueError( - f"for Period, please use " - f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name.upper())}\' " - f"instead of \'{name}\'" - ) - elif is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: - if name.upper() != name: - warnings.warn( - f"\'{name}\' is deprecated and will be removed in " - f"a future version, please use \'{name.upper()}\' " - f"instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = c_OFFSET_DEPR_FREQSTR.get(name.upper()) + name = _warn_about_deprecated_aliases(name, is_period) + _validate_to_offset_alias(name, is_period) + if is_period: + if name.upper() in c_PERIOD_TO_OFFSET_FREQSTR: + if name.upper() != name: + raise ValueError( + f"\'{name}\' is no longer supported, " + f"please use \'{name.upper()}\' instead.", + ) + name = c_PERIOD_TO_OFFSET_FREQSTR[name.upper()] + name = _lite_rule_alias.get(name, name) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") - prefix = _lite_rule_alias.get(name) or name if stride_sign is None: stride_sign = -1 if stride.startswith("-") else 1 if not stride: stride = 1 - if prefix in c_DEPR_ABBREVS: - warnings.warn( - f"\'{prefix}\' is deprecated and will be removed " - f"in a future version, please use " - f"\'{c_DEPR_ABBREVS.get(prefix)}\' instead.", - FutureWarning, - 
stacklevel=find_stack_level(), - ) - prefix = c_DEPR_ABBREVS[prefix] - - if prefix in {"D", "h", "min", "s", "ms", "us", "ns"}: + if name in {"D", "h", "min", "s", "ms", "us", "ns"}: # For these prefixes, we have something like "3h" or # "2.5min", so we can construct a Timedelta with the # matching unit and get our offset from delta_to_tick - td = Timedelta(1, unit=prefix) + td = Timedelta(1, unit=name) off = delta_to_tick(td) offset = off * float(stride) if n != 0: @@ -4929,7 +4906,7 @@ cpdef to_offset(freq, bint is_period=False): offset *= stride_sign else: stride = int(stride) - offset = _get_offset(prefix) + offset = _get_offset(name) offset = offset * int(np.fabs(stride) * stride_sign) if result is None: @@ -4939,14 +4916,19 @@ cpdef to_offset(freq, bint is_period=False): except (ValueError, TypeError) as err: raise ValueError(INVALID_FREQ_ERR_MSG.format( f"{freq}, failed to parse with error message: {repr(err)}") - ) + ) from err else: result = None if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - if is_period and not hasattr(result, "_period_dtype_code"): + try: + has_period_dtype_code = hasattr(result, "_period_dtype_code") + except ValueError: + has_period_dtype_code = False + + if is_period and not has_period_dtype_code: if isinstance(freq, str): raise ValueError(f"{result.name} is not supported as period frequency") else: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 35d2433a707a0..308183402198d 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -377,32 +377,33 @@ def parse_datetime_string_with_reso( raise ValueError(f'Given date string "{date_string}" not likely a datetime') # Try iso8601 first, as it handles nanoseconds - string_to_dts_failed = string_to_dts( - date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed: - # Match Timestamp and drop picoseconds, femtoseconds, attoseconds - # The new resolution will just be nano - # GH#50417 - if out_bestunit in _timestamp_units: - out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns - - if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: - # TODO: avoid circular import - from pandas import Timestamp - parsed = Timestamp(date_string) - else: - if out_local: - tz = timezone(timedelta(minutes=out_tzoffset)) + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + date_string, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + # Match Timestamp and drop picoseconds, femtoseconds, attoseconds + # The new resolution will just be nano + # GH#50417 + if out_bestunit in _timestamp_units: + out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns + + if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: + # TODO: avoid circular import + from pandas import Timestamp + parsed = Timestamp(date_string) else: - tz = None - parsed = datetime_new( - dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz - ) + if out_local: + tz = timezone(timedelta(minutes=out_tzoffset)) + else: + tz = None + parsed = datetime_new( + dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz + ) - reso = npy_unit_to_attrname[out_bestunit] - return parsed, reso + reso = npy_unit_to_attrname[out_bestunit] + return parsed, reso parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit) if parsed is not None: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 838b5b9f4595f..c6ba97fe9f1a2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ 
b/pandas/_libs/tslibs/period.pyx @@ -2329,6 +2329,12 @@ cdef class _Period(PeriodMixin): """ Return the quarter this Period falls on. + See Also + -------- + Timestamp.quarter : Return the quarter of the Timestamp. + Period.year : Return the year of the period. + Period.month : Return the month of the period. + Examples -------- >>> period = pd.Period('2022-04', 'M') @@ -2443,6 +2449,12 @@ cdef class _Period(PeriodMixin): """ Return True if the period's year is in a leap year. + See Also + -------- + Timestamp.is_leap_year : Check if the year in a Timestamp is a leap year. + DatetimeIndex.is_leap_year : Boolean indicator if the date belongs to a + leap year. + Examples -------- >>> period = pd.Period('2022-01', 'M') @@ -2460,11 +2472,24 @@ cdef class _Period(PeriodMixin): """ Return the period of now's date. + The `now` method provides a convenient way to generate a period + object for the current date and time. This can be particularly + useful in financial and economic analysis, where data is often + collected and analyzed in regular intervals (e.g., hourly, daily, + monthly). By specifying the frequency, users can create periods + that match the granularity of their data. + Parameters ---------- freq : str, BaseOffset Frequency to use for the returned period. + See Also + -------- + to_datetime : Convert argument to datetime. + Period : Represents a period of time. + Period.to_timestamp : Return the Timestamp representation of the Period. + Examples -------- >>> pd.Period.now('h') # doctest: +SKIP @@ -2693,6 +2718,12 @@ class Period(_Period): second : int, default 0 Second value of the period. + See Also + -------- + Timestamp : Pandas replacement for python datetime.datetime object. + date_range : Return a fixed frequency DatetimeIndex. + timedelta_range : Generates a fixed frequency range of timedeltas. + Examples -------- >>> period = pd.Period('2012-1-1', freq='D') diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d6c3285d89c59..43279051e2a30 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -354,7 +354,7 @@ def array_strptime( bint exact=True, errors="raise", bint utc=False, - NPY_DATETIMEUNIT creso=NPY_FR_ns, + NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC, ): """ Calculates the datetime structs represented by the passed array of strings @@ -365,7 +365,7 @@ def array_strptime( fmt : string-like regex exact : matches must be exact if True, search if False errors : string specifying error handling, {'raise', 'coerce'} - creso : NPY_DATETIMEUNIT, default NPY_FR_ns + creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC Set to NPY_FR_GENERIC to infer a resolution. 
""" @@ -712,7 +712,7 @@ cdef tzinfo _parse_with_format( elif len(s) <= 6: item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us else: - item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ns + item_reso[0] = NPY_FR_ns # Pad to always return nanoseconds s += "0" * (9 - len(s)) us = int(s) diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 24ec6c8891a89..979a5666661b2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -39,8 +39,6 @@ UnitChoices: TypeAlias = Literal[ "minute", "min", "minutes", - "T", - "t", "s", "seconds", "sec", @@ -50,21 +48,17 @@ UnitChoices: TypeAlias = Literal[ "millisecond", "milli", "millis", - "L", - "l", "us", "microseconds", "microsecond", "µs", "micro", "micros", - "u", "ns", "nanoseconds", "nano", "nanos", "nanosecond", - "n", ] _S = TypeVar("_S", bound=timedelta) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 4ff2df34ac717..d5348311f19e2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -43,7 +43,7 @@ from pandas._libs.tslibs.conversion cimport ( precision_from_unit, ) from pandas._libs.tslibs.dtypes cimport ( - c_DEPR_ABBREVS, + c_DEPR_UNITS, get_supported_reso, is_supported_unit, npy_unit_to_abbrev, @@ -719,15 +719,15 @@ cpdef inline str parse_timedelta_unit(str unit): return "ns" elif unit == "M": return unit - elif unit in c_DEPR_ABBREVS: + elif unit in c_DEPR_UNITS: warnings.warn( f"\'{unit}\' is deprecated and will be removed in a " - f"future version. Please use \'{c_DEPR_ABBREVS.get(unit)}\' " + f"future version. Please use \'{c_DEPR_UNITS.get(unit)}\' " f"instead of \'{unit}\'.", FutureWarning, stacklevel=find_stack_level(), ) - unit = c_DEPR_ABBREVS[unit] + unit = c_DEPR_UNITS[unit] try: return timedelta_abbrevs[unit.lower()] except KeyError: @@ -1078,10 +1078,22 @@ cdef class _Timedelta(timedelta): """ Returns the days of the timedelta. + The `days` attribute of a `pandas.Timedelta` object provides the number + of days represented by the `Timedelta`. This is useful for extracting + the day component from a `Timedelta` that may also include hours, minutes, + seconds, and smaller time units. This attribute simplifies the process + of working with durations where only the day component is of interest. + Returns ------- int + See Also + -------- + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. + Timedelta.total_seconds : Returns the total duration in seconds. + Examples -------- >>> td = pd.Timedelta(1, "d") @@ -1731,6 +1743,12 @@ cdef class _Timedelta(timedelta): ------- Timedelta + See Also + -------- + Timedelta : Represents a duration, the difference between two dates or times. + to_timedelta : Convert argument to timedelta. + Timedelta.asm8 : Return a numpy timedelta64 array scalar view. + Examples -------- >>> td = pd.Timedelta('1001ms') @@ -1800,10 +1818,10 @@ class Timedelta(_Timedelta): * 'microseconds', 'microsecond', 'micros', 'micro', or 'us' * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'. - .. deprecated:: 2.2.0 + .. deprecated:: 3.0.0 - Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour - of the values `h`, `min`, `s`, `ms`, `us`, and `ns`. + Allowing the values `w`, `d`, `MIN`, `MS`, `US` and `NS` to denote units + are deprecated in favour of the values `W`, `D`, `min`, `ms`, `us` and `ns`. 
**kwargs Available kwargs: {days, seconds, microseconds, diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 04bd439b40b8d..369184d9df40c 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1139,7 +1139,7 @@ cdef class _Timestamp(ABCTimestamp): See Also -------- - Timestamp.asm8 : Return numpy datetime64 format in nanoseconds. + Timestamp.asm8 : Return numpy datetime64 format with same precision. Timestamp.to_pydatetime : Convert Timestamp object to a native Python datetime object. to_timedelta : Convert argument into timedelta object, @@ -1170,7 +1170,7 @@ cdef class _Timestamp(ABCTimestamp): @property def asm8(self) -> np.datetime64: """ - Return numpy datetime64 format in nanoseconds. + Return numpy datetime64 format with same precision. See Also -------- @@ -1374,11 +1374,11 @@ class Timestamp(_Timestamp): Timezone info. nanosecond : int, optional, default 0 Value of nanosecond. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will have. unit : str Unit used for conversion if ts_input is of type int or float. The - valid values are 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For + valid values are 'W', 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For example, 's' means seconds and 'ms' means milliseconds. For float inputs, the result will be stored in nanoseconds, and @@ -1417,6 +1417,11 @@ class Timestamp(_Timestamp): >>> pd.Timestamp(1513393355.5, unit='s') Timestamp('2017-12-16 03:02:35.500000') + This converts an int representing a Unix-epoch in units of weeks + + >>> pd.Timestamp(1535, unit='W') + Timestamp('1999-06-03 00:00:00') + This converts an int representing a Unix-epoch in units of seconds and for a particular timezone @@ -1441,7 +1446,7 @@ class Timestamp(_Timestamp): ---------- ordinal : int Date corresponding to a proleptic Gregorian ordinal. - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. Notes @@ -1460,11 +1465,21 @@ class Timestamp(_Timestamp): """ Return new Timestamp object representing current time local to tz. + This method returns a new `Timestamp` object that represents the current time. + If a timezone is provided, the current time will be localized to that timezone. + Otherwise, it returns the current local time. + Parameters ---------- tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp.utcnow : Return a new Timestamp representing UTC day and time. + Timestamp.today : Return the current time in the local timezone. + Examples -------- >>> pd.Timestamp.now() # doctest: +SKIP @@ -1763,6 +1778,16 @@ class Timestamp(_Timestamp): """ Return time object with same time but with tzinfo=None. + This method extracts the time part of the `Timestamp` object, excluding any + timezone information. It returns a `datetime.time` object which only represents + the time (hours, minutes, seconds, and microseconds). + + See Also + -------- + Timestamp.date : Return date object with same year, month and day. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. 
+ Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -2202,6 +2227,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.ceil : Round up a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.floor : Round down the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, flooring will take place relative to the @@ -2357,6 +2388,17 @@ timedelta}, default 'raise' """ Alias for tzinfo. + The `tz` property provides a simple and direct way to retrieve the timezone + information of a `Timestamp` object. It is particularly useful when working + with time series data that includes timezone information, allowing for easy + access and manipulation of the timezone context. + + See Also + -------- + Timestamp.tzinfo : Returns the timezone information of the Timestamp. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Examples -------- >>> ts = pd.Timestamp(1584226800, unit='s', tz='Europe/Stockholm') @@ -2382,7 +2424,7 @@ timedelta}, default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding local time. @@ -2489,7 +2531,7 @@ default 'raise' Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will be converted to. None will remove timezone holding UTC time. @@ -2593,13 +2635,13 @@ default 'raise' Replace timezone (not a conversion): - >>> import pytz - >>> ts.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> import zoneinfo + >>> ts.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific') Analogous for ``pd.NaT``: - >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific')) + >>> pd.NaT.replace(tzinfo=zoneinfo.ZoneInfo('US/Pacific')) NaT """ @@ -2733,6 +2775,13 @@ default 'raise' Monday == 1 ... Sunday == 7. + See Also + -------- + Timestamp.weekday : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isocalendar : Return a tuple containing ISO year, week number + and weekday. + datetime.date.isoweekday : Equivalent method in datetime module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 10e5790dd1c35..6292b6ce0fd1d 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -119,27 +119,26 @@ cpdef inline object get_timezone(tzinfo tz): raise TypeError("tz argument cannot be None") if is_utc(tz): return tz + elif is_zoneinfo(tz): + return tz.key + elif treat_tz_as_pytz(tz): + zone = tz.zone + if zone is None: + return tz + return zone + elif treat_tz_as_dateutil(tz): + if ".tar.gz" in tz._filename: + raise ValueError( + "Bad tz filename. Dateutil on python 3 on windows has a " + "bug which causes tzfile._filename to be the same for all " + "timezone files. Please construct dateutil timezones " + 'implicitly by passing a string like "dateutil/Europe' + '/London" when you construct your pandas objects instead ' + "of passing a timezone object. 
See " + "https://github.com/pandas-dev/pandas/pull/7362") + return "dateutil/" + tz._filename else: - if treat_tz_as_dateutil(tz): - if ".tar.gz" in tz._filename: - raise ValueError( - "Bad tz filename. Dateutil on python 3 on windows has a " - "bug which causes tzfile._filename to be the same for all " - "timezone files. Please construct dateutil timezones " - 'implicitly by passing a string like "dateutil/Europe' - '/London" when you construct your pandas objects instead ' - "of passing a timezone object. See " - "https://github.com/pandas-dev/pandas/pull/7362") - return "dateutil/" + tz._filename - else: - # tz is a pytz timezone or unknown. - try: - zone = tz.zone - if zone is None: - return tz - return zone - except AttributeError: - return tz + return tz cpdef inline tzinfo maybe_get_tz(object tz): diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 6365c030b695b..5b9ee095d4643 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1813,6 +1813,9 @@ def ewm(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, if normalize: # avoid numerical errors on constant series if weighted != cur: + if not adjust and com == 1: + # update in case of irregular-interval series + new_wt = 1. - old_wt weighted = old_wt * weighted + new_wt * cur weighted /= (old_wt + new_wt) if adjust: diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 12395b42bba19..3aa53d4b07aa5 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -6,11 +6,9 @@ from sys import byteorder from typing import ( TYPE_CHECKING, - Callable, ContextManager, cast, ) -import warnings import numpy as np @@ -58,7 +56,6 @@ assert_indexing_slices_equivalent, assert_interval_array_equal, assert_is_sorted, - assert_is_valid_plot_return_object, assert_metadata_equivalent, assert_numpy_array_equal, assert_period_array_equal, @@ -87,6 +84,8 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( Dtype, NpDtype, @@ -108,6 +107,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] STRING_DTYPES: list[Dtype] = [str, "str", "U"] +COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"] TIMEDELTA64_DTYPES: list[Dtype] = ["timedelta64[ns]", "m8[ns]"] @@ -290,17 +290,11 @@ def box_expected(expected, box_cls, transpose: bool = True): else: expected = pd.array(expected, copy=False) elif box_cls is Index: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Index(expected) + expected = Index(expected) elif box_cls is Series: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected) + expected = Series(expected) elif box_cls is DataFrame: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) - expected = Series(expected).to_frame() + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame @@ -515,14 +509,14 @@ def shares_memory(left, right) -> bool: if ( isinstance(left, ExtensionArray) and is_string_dtype(left.dtype) - and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: 
ignore[attr-defined] + and left.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left = cast("ArrowExtensionArray", left) if ( isinstance(right, ExtensionArray) and is_string_dtype(right.dtype) - and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and right.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): right = cast("ArrowExtensionArray", right) left_pa_data = left._pa_array @@ -538,8 +532,8 @@ def shares_memory(left, right) -> bool: left._mask, right._mask ) - if isinstance(left, DataFrame) and len(left._mgr.arrays) == 1: - arr = left._mgr.arrays[0] + if isinstance(left, DataFrame) and len(left._mgr.blocks) == 1: + arr = left._mgr.blocks[0].values return shares_memory(arr, right) raise NotImplementedError(type(left), type(right)) @@ -565,7 +559,6 @@ def shares_memory(left, right) -> bool: "assert_indexing_slices_equivalent", "assert_interval_array_equal", "assert_is_sorted", - "assert_is_valid_plot_return_object", "assert_metadata_equivalent", "assert_numpy_array_equal", "assert_period_array_equal", diff --git a/pandas/_testing/_hypothesis.py b/pandas/_testing/_hypothesis.py index b7fc175b10d17..bbad21d8ab8d1 100644 --- a/pandas/_testing/_hypothesis.py +++ b/pandas/_testing/_hypothesis.py @@ -6,7 +6,6 @@ from hypothesis import strategies as st from hypothesis.extra.dateutil import timezones as dateutil_timezones -from hypothesis.extra.pytz import timezones as pytz_timezones from pandas.compat import is_platform_windows @@ -57,7 +56,7 @@ DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes( min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), # pyright: ignore[reportArgumentType] max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), # pyright: ignore[reportArgumentType] - timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), + timezones=st.one_of(st.none(), dateutil_timezones(), st.timezones()), ) DATETIME_IN_PD_TIMESTAMP_RANGE_NO_TZ = st.datetimes( diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index 2955108d3db1a..e1841c95dcdfe 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -7,21 +7,18 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import uuid import zipfile -from pandas.compat import ( - get_bz2_file, - get_lzma_file, -) from pandas.compat._optional import import_optional_dependency import pandas as pd from pandas._testing.contexts import ensure_clean if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( FilePath, ReadPickleBuffer, @@ -129,11 +126,15 @@ def write_to_compressed(compression, path, data, dest: str = "test") -> None: elif compression == "gzip": compress_method = gzip.GzipFile elif compression == "bz2": - compress_method = get_bz2_file() + import bz2 + + compress_method = bz2.BZ2File elif compression == "zstd": compress_method = import_optional_dependency("zstandard").open elif compression == "xz": - compress_method = get_lzma_file() + import lzma + + compress_method = lzma.LZMAFile else: raise ValueError(f"Unrecognized compression type: {compression}") diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 543d7944e4c5d..d52dabe47279a 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -196,7 +196,9 @@ def assert_index_equal( Parameters ---------- left : Index + The first index to compare. right : Index + The second index to compare. 
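The `aggregations.pyx` hunk earlier in this patch changes how `new_wt` is refreshed for exponentially weighted means over irregularly spaced observations when `adjust=False`. A minimal sketch of the affected call, assuming this pandas build accepts `times` together with `adjust=False`:

```python
import pandas as pd

# Irregularly spaced observations: the gap between the last two points is
# two days, so the prior weight should decay twice as much as for a one-day gap.
ser = pd.Series([0.0, 1.0, 2.0])
times = pd.DatetimeIndex(["2024-01-01", "2024-01-02", "2024-01-04"])

# With adjust=False, the hunk recomputes new_wt = 1 - old_wt *after* old_wt
# has been decayed by the elapsed interval, instead of reusing a fixed weight.
print(ser.ewm(halflife="1 day", times=times, adjust=False).mean())
```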
exact : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. If 'equiv', then RangeIndex can be substituted for @@ -219,6 +221,11 @@ def assert_index_equal( Specify object name being compared, internally used to show appropriate assertion message. + See Also + -------- + testing.assert_series_equal : Check that two Series are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. + Examples -------- >>> from pandas import testing as tm @@ -422,28 +429,6 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None: return None -def assert_is_valid_plot_return_object(objs) -> None: - from matplotlib.artist import Artist - from matplotlib.axes import Axes - - if isinstance(objs, (Series, np.ndarray)): - if isinstance(objs, Series): - objs = objs._values - for el in objs.ravel(): - msg = ( - "one of 'objs' is not a matplotlib Axes instance, " - f"type encountered {type(el).__name__!r}" - ) - assert isinstance(el, (Axes, dict)), msg - else: - msg = ( - "objs is neither an ndarray of Artist instances nor a single " - "ArtistArtist instance, tuple, or dict, 'objs' is a " - f"{type(objs).__name__!r}" - ) - assert isinstance(objs, (Artist, tuple, dict)), msg - - def assert_is_sorted(seq) -> None: """Assert that the sequence is sorted.""" if isinstance(seq, (Index, Series)): @@ -593,13 +578,19 @@ def raise_assert_detail( if isinstance(left, np.ndarray): left = pprint_thing(left) - elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(left, (CategoricalDtype, NumpyEADtype)): left = repr(left) + elif isinstance(left, StringDtype): + # TODO(infer_string) this special case could be avoided if we have + # a more informative repr https://github.com/pandas-dev/pandas/issues/59342 + left = f"StringDtype(storage={left.storage}, na_value={left.na_value})" if isinstance(right, np.ndarray): right = pprint_thing(right) - elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(right, (CategoricalDtype, NumpyEADtype)): right = repr(right) + elif isinstance(right, StringDtype): + right = f"StringDtype(storage={right.storage}, na_value={right.na_value})" msg += f""" [left]: {left} @@ -850,7 +841,9 @@ def assert_series_equal( Parameters ---------- left : Series + First Series to compare. right : Series + Second Series to compare. check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' @@ -901,6 +894,11 @@ def assert_series_equal( .. versionadded:: 1.5.0 + See Also + -------- + testing.assert_index_equal : Check that two Indexes are equal. + testing.assert_frame_equal : Check that two DataFrames are equal. 
+ Examples -------- >>> from pandas import testing as tm diff --git a/pandas/_typing.py b/pandas/_typing.py index ef68018f2721a..d43e6e900546d 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterator, Mapping, @@ -18,7 +19,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, Optional, Protocol, @@ -90,18 +90,12 @@ # Name "npt._ArrayLikeInt_co" is not defined [name-defined] NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined] - from typing import SupportsIndex - - if sys.version_info >= (3, 10): - from typing import Concatenate # pyright: ignore[reportUnusedImport] - from typing import ParamSpec - from typing import TypeGuard # pyright: ignore[reportUnusedImport] - else: - from typing_extensions import ( # pyright: ignore[reportUnusedImport] - Concatenate, - ParamSpec, - TypeGuard, - ) + from typing import ( + ParamSpec, + SupportsIndex, + ) + from typing import Concatenate # pyright: ignore[reportUnusedImport] + from typing import TypeGuard # pyright: ignore[reportUnusedImport] P = ParamSpec("P") @@ -516,6 +510,7 @@ def closed(self) -> bool: # ExcelWriter ExcelWriterIfSheetExists = Literal["error", "new", "replace", "overlay"] +ExcelWriterMergeCells = Union[bool, Literal["columns"]] # Offsets OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"] @@ -531,3 +526,5 @@ def closed(self) -> bool: # maintaine the sub-type of any hashable sequence SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable]) + +SliceType = Optional[Hashable] diff --git a/pandas/_version.py b/pandas/_version.py index 7bd9da2bb1cfa..b32c9e67fdbb6 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -10,13 +10,13 @@ """Git implementation of _version.py.""" +from collections.abc import Callable import errno import functools import os import re import subprocess import sys -from typing import Callable def get_keywords(): diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 4583e7edebbdc..e08da7c7e14e3 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -18,13 +18,11 @@ from pandas.compat._constants import ( IS64, ISMUSL, - PY310, PY311, PY312, PYPY, WASM, ) -import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( pa_version_under10p1, @@ -33,6 +31,7 @@ pa_version_under14p0, pa_version_under14p1, pa_version_under16p0, + pa_version_under17p0, ) if TYPE_CHECKING: @@ -148,52 +147,6 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: - """ - Importing the `LZMAFile` class from the `lzma` module. - - Returns - ------- - class - The `LZMAFile` class from the `lzma` module. - - Raises - ------ - RuntimeError - If the `lzma` module was not imported correctly, or didn't exist. - """ - if not pandas.compat.compressors.has_lzma: - raise RuntimeError( - "lzma module not available. " - "A Python re-install with the proper dependencies, " - "might be required to solve this issue." - ) - return pandas.compat.compressors.LZMAFile - - -def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: - """ - Importing the `BZ2File` class from the `bz2` module. - - Returns - ------- - class - The `BZ2File` class from the `bz2` module. - - Raises - ------ - RuntimeError - If the `bz2` module was not imported correctly, or didn't exist. 
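The `_typing.py` hunk above is one instance of a cleanup repeated throughout this patch: with the minimum Python now 3.10, `Callable` moves to `collections.abc` and `Concatenate`/`ParamSpec`/`TypeGuard` import straight from `typing`, dropping the `typing_extensions` fallback. A sketch of the target pattern in an illustrative module (the names below are not from the diff):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # zero runtime cost: these names exist only for the type checker, and on
    # Python >= 3.10 none of them need the typing_extensions fallback
    from collections.abc import Callable
    from typing import Concatenate, ParamSpec, TypeGuard

    P = ParamSpec("P")
```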
- """ - if not pandas.compat.compressors.has_bz2: - raise RuntimeError( - "bz2 module not available. " - "A Python re-install with the proper dependencies, " - "might be required to solve this issue." - ) - return pandas.compat.compressors.BZ2File - - __all__ = [ "is_numpy_dev", "pa_version_under10p1", @@ -202,9 +155,9 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p0", "pa_version_under14p1", "pa_version_under16p0", + "pa_version_under17p0", "IS64", "ISMUSL", - "PY310", "PY311", "PY312", "PYPY", diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index 2625389e5254a..c7b7341013251 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -13,7 +13,6 @@ IS64 = sys.maxsize > 2**32 -PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) PY312 = sys.version_info >= (3, 12) PYPY = platform.python_implementation() == "PyPy" @@ -24,7 +23,6 @@ __all__ = [ "IS64", "ISMUSL", - "PY310", "PY311", "PY312", "PYPY", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index f4e717c26d6fd..06082e71af32a 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: import types -# Update install.rst, actions-39-minimum_versions.yaml, +# Update install.rst, actions-310-minimum_versions.yaml, # deps_minimum.toml & pyproject.toml when updating versions! VERSIONS = { @@ -28,7 +28,7 @@ "fastparquet": "2023.10.0", "fsspec": "2022.11.0", "html5lib": "1.1", - "hypothesis": "6.46.1", + "hypothesis": "6.84.0", "gcsfs": "2022.11.0", "jinja2": "3.1.2", "lxml.etree": "4.9.2", diff --git a/pandas/compat/compressors.py b/pandas/compat/compressors.py deleted file mode 100644 index 1f31e34c092c9..0000000000000 --- a/pandas/compat/compressors.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5. -""" - -from __future__ import annotations - -from pickle import PickleBuffer - -from pandas.compat._constants import PY310 - -try: - import bz2 - - has_bz2 = True -except ImportError: - has_bz2 = False - -try: - import lzma - - has_lzma = True -except ImportError: - has_lzma = False - - -def flatten_buffer( - b: bytes | bytearray | memoryview | PickleBuffer, -) -> bytes | bytearray | memoryview: - """ - Return some 1-D `uint8` typed buffer. - - Coerces anything that does not match that description to one that does - without copying if possible (otherwise will copy). - """ - - if isinstance(b, (bytes, bytearray)): - return b - - if not isinstance(b, PickleBuffer): - b = PickleBuffer(b) - - try: - # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy - return b.raw() - except BufferError: - # perform in-memory copy if buffer is not contiguous - return memoryview(b).tobytes("A") - - -if has_bz2: - - class BZ2File(bz2.BZ2File): - if not PY310: - - def write(self, b) -> int: - # Workaround issue where `bz2.BZ2File` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. - return super().write(flatten_buffer(b)) - - -if has_lzma: - - class LZMAFile(lzma.LZMAFile): - if not PY310: - - def write(self, b) -> int: - # Workaround issue where `lzma.LZMAFile` expects `len` - # to return the number of bytes in `b` by converting - # `b` into something that meets that constraint with - # minimal copying. - # - # Note: This is fixed in Python 3.10. 
- return super().write(flatten_buffer(b)) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 54a12c76a230b..2fab8f32b8e71 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -12,7 +12,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") -np_version_gt2 = _nlv >= Version("2.0.0.dev0") +np_version_gt2 = _nlv >= Version("2.0.0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.23.5" diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 5a96e5a4cc49a..87d3dc86cee87 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,7 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") + pa_version_under17p0 = _palv < Version("17.0.0") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -25,3 +26,4 @@ pa_version_under14p1 = True pa_version_under15p0 = True pa_version_under16p0 = True + pa_version_under17p0 = True diff --git a/pandas/conftest.py b/pandas/conftest.py index 0ab51139528ad..11196ad069366 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -32,10 +32,7 @@ import gc import operator import os -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import uuid from dateutil.tz import ( @@ -79,10 +76,10 @@ Index, MultiIndex, ) -from pandas.util.version import Version if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, ) @@ -150,6 +147,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("is_categorical_dtype", "is_categorical_dtype is deprecated"), ("is_sparse", "is_sparse is deprecated"), ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"), + ("DataFrameGroupBy.corrwith", "DataFrameGroupBy.corrwith is deprecated"), ("NDFrame.replace", "Series.replace without 'value'"), ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"), ("Series.idxmin", "The behavior of Series.idxmin"), @@ -183,9 +181,10 @@ def pytest_collection_modifyitems(items, config) -> None: ignore_doctest_warning(item, path, message) -hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] -if Version(hypothesis.__version__) >= Version("6.83.2"): - hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors) +hypothesis_health_checks = [ + hypothesis.HealthCheck.too_slow, + hypothesis.HealthCheck.differing_executors, +] # Hypothesis hypothesis.settings.register_profile( @@ -952,6 +951,9 @@ def rand_series_with_duplicate_datetimeindex() -> Series: ] ) def ea_scalar_and_dtype(request): + """ + Fixture that tests each scalar and datetime type. + """ return request.param @@ -1294,7 +1296,6 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1303,7 +1304,24 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'pyarrow_numpy' + """ + return request.param + + +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ] +) +def string_dtype_arguments(request): + """ + Parametrized fixture for StringDtype storage and na_value. 
+ + * 'python' + pd.NA + * 'pyarrow' + pd.NA + * 'pyarrow' + np.nan """ return request.param @@ -1352,20 +1370,33 @@ def object_dtype(request): @pytest.fixture( params=[ - "object", - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ] + np.dtype("object"), + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ], + ids=[ + "string=object", + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + ], ) def any_string_dtype(request): """ Parametrized fixture for string dtypes. * 'object' - * 'string[python]' - * 'string[pyarrow]' + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) """ - return request.param + if isinstance(request.param, np.dtype): + return request.param + else: + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) @pytest.fixture(params=tm.DATETIME64_DTYPES) @@ -1449,6 +1480,21 @@ def complex_dtype(request): return request.param +@pytest.fixture(params=tm.COMPLEX_FLOAT_DTYPES) +def complex_or_float_dtype(request): + """ + Parameterized fixture for complex and numpy float dtypes. + + * complex + * 'complex64' + * 'complex128' + * float + * 'float32' + * 'float64' + """ + return request.param + + @pytest.fixture(params=tm.SIGNED_INT_NUMPY_DTYPES) def any_signed_int_numpy_dtype(request): """ @@ -2009,14 +2055,6 @@ def warsaw(request) -> str: return request.param -@pytest.fixture -def arrow_string_storage(): - """ - Fixture that lists possible PyArrow values for StringDtype storage field. 
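The reworked fixtures above parametrize over `(storage, na_value)` pairs instead of a separate `"pyarrow_numpy"` storage string. A sketch of the two variants a test would receive, assuming a build with this patch applied:

```python
import numpy as np
import pandas as pd

# NA semantics (the default) vs NaN semantics; storage is "python" or
# "pyarrow", and the old "pyarrow_numpy" spelling folds into the latter.
na_dtype = pd.StringDtype("python", na_value=pd.NA)
nan_dtype = pd.StringDtype("python", na_value=np.nan)

# equality takes na_value into account (see the string_.py hunks further down)
assert na_dtype != nan_dtype
```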
- """ - return ("pyarrow", "pyarrow_numpy") - - @pytest.fixture def temp_file(tmp_path): """ diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 0a26acb7df60a..3f3ebe8dbe023 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -4,16 +4,18 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) if TYPE_CHECKING: + from collections.abc import Callable from pandas._typing import Scalar import numpy as np from pandas.compat._optional import import_optional_dependency +from pandas.core.util.numba_ import jit_user_function + @functools.cache def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): @@ -21,10 +23,10 @@ def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): import numba else: numba = import_optional_dependency("numba") - nb_compat_func = numba.extending.register_jitable(func) + nb_compat_func = jit_user_function(func) @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def nb_looper(values, axis): + def nb_looper(values, axis, *args): # Operate on the first row/col in order to get # the output shape if axis == 0: @@ -33,7 +35,7 @@ def nb_looper(values, axis): else: first_elem = values[0] dim0 = values.shape[0] - res0 = nb_compat_func(first_elem) + res0 = nb_compat_func(first_elem, *args) # Use np.asarray to get shape for # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape @@ -44,11 +46,11 @@ def nb_looper(values, axis): if axis == 1: buff[0] = res0 for i in numba.prange(1, values.shape[0]): - buff[i] = nb_compat_func(values[i]) + buff[i] = nb_compat_func(values[i], *args) else: buff[:, 0] = res0 for j in numba.prange(1, values.shape[1]): - buff[:, j] = nb_compat_func(values[:, j]) + buff[:, j] = nb_compat_func(values[:, j], *args) return buff return nb_looper diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 3acbfc3eabbac..d8463fda34caa 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, - Callable, final, ) import warnings @@ -18,6 +17,8 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import TypeT from pandas import Index diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 33beef23197bd..948836bf6a51d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -68,6 +68,7 @@ ABCExtensionArray, ABCIndex, ABCMultiIndex, + ABCNumpyExtensionArray, ABCSeries, ABCTimedeltaArray, ) @@ -222,13 +223,17 @@ def _ensure_arraylike(values, func_name: str) -> ArrayLike: """ ensure that we are arraylike if not already """ - if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): + if not isinstance( + values, + (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray, ABCNumpyExtensionArray), + ): # GH#52986 if func_name != "isin-targets": # Make an exception for the comps argument in isin. raise TypeError( f"{func_name} requires a Series, Index, " - f"ExtensionArray, or np.ndarray, got {type(values).__name__}." + f"ExtensionArray, np.ndarray or NumpyExtensionArray " + f"got {type(values).__name__}." ) inferred = lib.infer_dtype(values, skipna=False) @@ -319,10 +324,12 @@ def unique(values): Parameters ---------- values : 1d array-like + The input array-like object containing values from which to extract + unique values. 
Returns ------- - numpy.ndarray or ExtensionArray + numpy.ndarray, ExtensionArray or NumpyExtensionArray The return can be: @@ -330,7 +337,7 @@ def unique(values): * Categorical : when the input is a Categorical dtype * ndarray : when the input is a Series/ndarray - Return numpy.ndarray or ExtensionArray. + Return numpy.ndarray, ExtensionArray or NumpyExtensionArray. See Also -------- @@ -346,14 +353,15 @@ def unique(values): array([2, 1]) >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) - array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + array(['2016-01-01T00:00:00'], dtype='datetime64[s]') >>> pd.unique( ... pd.Series( ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) @@ -365,7 +373,8 @@ def unique(values): ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) DatetimeIndex(['2016-01-01 00:00:00-05:00'], @@ -400,6 +409,13 @@ def unique(values): >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) + + An NumpyExtensionArray of complex + + >>> pd.unique(pd.array([1 + 1j, 2, 3])) + + [(1+1j), (2+0j), (3+0j)] + Length: 3, dtype: complex128 """ return unique_with_mask(values) @@ -1157,11 +1173,14 @@ def take( ... ) array([ 10, 10, -10]) """ - if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)): + if not isinstance( + arr, + (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, ABCNumpyExtensionArray), + ): # GH#52981 raise TypeError( - "pd.api.extensions.take requires a numpy.ndarray, " - f"ExtensionArray, Index, or Series, got {type(arr).__name__}." + "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, " + f"Index, Series, or NumpyExtensionArray got {type(arr).__name__}." ) indices = ensure_platform_int(indices) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 25836e967e948..5959156d11123 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -2,13 +2,13 @@ import abc from collections import defaultdict +from collections.abc import Callable import functools from functools import partial import inspect from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, ) @@ -51,6 +51,10 @@ from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.util.numba_ import ( + get_jit_arguments, + prepare_function_arguments, +) if TYPE_CHECKING: from collections.abc import ( @@ -70,7 +74,6 @@ from pandas.core.resample import Resampler from pandas.core.window.rolling import BaseWindow - ResType = dict[int, Any] @@ -87,16 +90,19 @@ def frame_apply( kwargs=None, ) -> FrameApply: """construct and return a row or column based frame apply object""" + _, func, columns, _ = reconstruct_func(func, **kwargs) + axis = obj._get_axis_number(axis) klass: type[FrameApply] if axis == 0: klass = FrameRowApply elif axis == 1: + if columns: + raise NotImplementedError( + f"Named aggregation is not supported when {axis=}." 
+ ) klass = FrameColumnApply - _, func, _, _ = reconstruct_func(func, **kwargs) - assert func is not None - return klass( obj, func, @@ -471,8 +477,24 @@ def compute_dict_like( keys += [key] * len(key_data) results += key_data - else: + elif is_groupby: # key used for column selection and output + + df = selected_obj + results, keys = [], [] + for key, how in func.items(): + cols = df[key] + + if cols.ndim == 1: + series = obj._gotitem(key, ndim=1, subset=cols) + results.append(getattr(series, op_name)(how, **kwargs)) + keys.append(key) + else: + for _, col in cols.items(): + series = obj._gotitem(key, ndim=1, subset=col) + results.append(getattr(series, op_name)(how, **kwargs)) + keys.append(key) + else: results = [ getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) for key, how in func.items() @@ -496,11 +518,14 @@ def wrap_results_dict_like( is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data] if all(is_ndframe): - results = dict(zip(result_index, result_data)) + results = [result for result in result_data if not result.empty] keys_to_use: Iterable[Hashable] - keys_to_use = [k for k in result_index if not results[k].empty] + keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty] # Have to check, if at least one DataFrame is not empty. - keys_to_use = keys_to_use if keys_to_use != [] else result_index + if keys_to_use == []: + keys_to_use = result_index + results = result_data + if selected_obj.ndim == 2: # keys are columns, so we can preserve names ktu = Index(keys_to_use) @@ -509,7 +534,7 @@ def wrap_results_dict_like( axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1 result = concat( - {k: results[k] for k in keys_to_use}, + results, axis=axis, keys=keys_to_use, ) @@ -972,17 +997,20 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - engine_kwargs = {} if engine_kwargs is None else engine_kwargs - + args, kwargs = prepare_function_arguments( + self.func, # type: ignore[arg-type] + self.args, + self.kwargs, + ) # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has # incompatible type "Callable[..., Any] | str | list[Callable # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( self.func, # type: ignore[arg-type] - **engine_kwargs, + **get_jit_arguments(engine_kwargs, kwargs), ) - result = nb_looper(self.values, self.axis) + result = nb_looper(self.values, self.axis, *args) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) else: @@ -1123,21 +1151,23 @@ def generate_numba_apply_func( # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. 
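The `apply.py` hunks in this region thread user-supplied `args`/`kwargs` through `prepare_function_arguments` into the jitted looper, so UDF arguments now reach the numba engine. A sketch, assuming numba is installed and this patch is applied:

```python
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

def shift_and_scale(col, shift, scale=1.0):
    return (col + shift) * scale

# positional args and keyword args are forwarded to the jitted UDF,
# here applied once per column
out = df.apply(shift_and_scale, engine="numba", args=(10.0,), scale=2.0)
print(out)
```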
@numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names, df_index): + def numba_func(values, col_names, df_index, *args): results = {} for j in range(values.shape[1]): # Create the series ser = Series( values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) ) - results[j] = jitted_udf(ser) + results[j] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments(func, self.args, self.kwargs) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs, kwargs) ) from pandas.core._numba.extensions import set_numba_data @@ -1152,7 +1182,7 @@ def apply_with_numba(self) -> dict[int, Any]: # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(index) as index, set_numba_data(columns) as columns: - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res @property @@ -1260,7 +1290,7 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names_index, index): + def numba_func(values, col_names_index, index, *args): results = {} # Currently the parallel argument doesn't get passed through here # (it's disabled) since the dicts in numba aren't thread-safe. @@ -1272,15 +1302,17 @@ def numba_func(values, col_names_index, index): index=col_names_index, name=maybe_cast_str(index[i]), ) - results[i] = jitted_udf(ser) + results[i] = jitted_udf(ser, *args) return results return numba_func def apply_with_numba(self) -> dict[int, Any]: + func = cast(Callable, self.func) + args, kwargs = prepare_function_arguments(func, self.args, self.kwargs) nb_func = self.generate_numba_apply_func( - cast(Callable, self.func), **self.engine_kwargs + func, **get_jit_arguments(self.engine_kwargs, kwargs) ) from pandas.core._numba.extensions import set_numba_data @@ -1291,7 +1323,7 @@ def apply_with_numba(self) -> dict[int, Any]: set_numba_data(self.obj.index) as index, set_numba_data(self.columns) as columns, ): - res = dict(nb_func(self.values, columns, index)) + res = dict(nb_func(self.values, columns, index, *args)) return res @@ -1825,11 +1857,13 @@ def relabel_result( com.get_callable_name(f) if not isinstance(f, str) else f for f in fun ] col_idx_order = Index(s.index).get_indexer(fun) - s = s.iloc[col_idx_order] - + valid_idx = col_idx_order != -1 + if valid_idx.any(): + s = s.iloc[col_idx_order[valid_idx]] # assign the new user-provided "named aggregation" as index names, and reindex # it based on the whole user-provided names. 
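`relabel_result` below maps raw aggregation output back onto the user-given labels of a named aggregation, now skipping labels whose results are empty. A sketch of the spelling it serves; note that the earlier `frame_apply` hunk makes the same spelling raise `NotImplementedError` for `axis=1`:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# named aggregation: output label -> (column, aggfunc)
print(df.agg(a_min=("a", "min"), b_max=("b", "max")))
```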
- s.index = reordered_indexes[idx : idx + len(fun)] + if not s.empty: + s.index = reordered_indexes[idx : idx + len(fun)] reordered_result_in_dict[col] = s.reindex(columns) idx = idx + len(fun) return reordered_result_in_dict diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py index c3a7c2e4fefb2..bc10dbfbec90d 100644 --- a/pandas/core/array_algos/datetimelike_accumulations.py +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Callable +from typing import TYPE_CHECKING import numpy as np @@ -12,6 +12,9 @@ from pandas.core.dtypes.missing import isna +if TYPE_CHECKING: + from collections.abc import Callable + def _cum_func( func: Callable, diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index b31d32a606eed..b4e116388b85e 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -5,14 +5,13 @@ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import numpy as np if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import npt diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 3784689995802..f2a32fbe2b0e5 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -5,10 +5,7 @@ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings import numpy as np @@ -18,6 +15,8 @@ from pandas.core.nanops import check_below_min_count if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( AxisInt, npt, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3d55513ab914c..a17056b51a014 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -18,7 +17,6 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - NaT, Timedelta, Timestamp, timezones, @@ -175,7 +173,10 @@ def floordiv_compat( } if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._libs.missing import NAType from pandas._typing import ( @@ -574,10 +575,8 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ): + if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + # TODO(infer_string) should this be large_string? 
pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype @@ -1705,8 +1704,6 @@ def pyarrow_meth(data, skip_nulls, **kwargs): if name == "median": # GH 52679: Use quantile instead of approximate_median; returns array result = result[0] - if pc.is_null(result).as_py(): - return result if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): result = result.cast(pa_type) @@ -2612,17 +2609,19 @@ def _str_wrap(self, width: int, **kwargs) -> Self: @property def _dt_days(self) -> Self: return type(self)( - pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + pa.array( + self._to_timedeltaarray().components.days, + from_pandas=True, + type=pa.int32(), + ) ) @property def _dt_hours(self) -> Self: return type(self)( pa.array( - [ - td.components.hours if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.hours, + from_pandas=True, type=pa.int32(), ) ) @@ -2631,10 +2630,8 @@ def _dt_hours(self) -> Self: def _dt_minutes(self) -> Self: return type(self)( pa.array( - [ - td.components.minutes if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.minutes, + from_pandas=True, type=pa.int32(), ) ) @@ -2643,7 +2640,9 @@ def _dt_minutes(self) -> Self: def _dt_seconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.seconds, + from_pandas=True, + type=pa.int32(), ) ) @@ -2651,10 +2650,8 @@ def _dt_seconds(self) -> Self: def _dt_milliseconds(self) -> Self: return type(self)( pa.array( - [ - td.components.milliseconds if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.milliseconds, + from_pandas=True, type=pa.int32(), ) ) @@ -2663,7 +2660,7 @@ def _dt_milliseconds(self) -> Self: def _dt_microseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().microseconds, + self._to_timedeltaarray().components.microseconds, from_pandas=True, type=pa.int32(), ) @@ -2673,7 +2670,9 @@ def _dt_microseconds(self) -> Self: def _dt_nanoseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.nanoseconds, + from_pandas=True, + type=pa.int32(), ) ) @@ -2791,7 +2790,10 @@ def _dt_days_in_month(self) -> Self: @property def _dt_microsecond(self) -> Self: - return type(self)(pc.microsecond(self._pa_array)) + # GH 59154 + us = pc.microsecond(self._pa_array) + ms_to_us = pc.multiply(pc.millisecond(self._pa_array), 1000) + return type(self)(pc.add(us, ms_to_us)) @property def _dt_minute(self) -> Self: @@ -2970,7 +2972,7 @@ def transpose_homogeneous_pyarrow( """ arrays = list(arrays) nrows, ncols = len(arrays[0]), len(arrays) - indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten() + indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1) arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks]) arr = arr.take(indices) return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)] diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f83fdcd46b371..b429b7c1b1fc4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, cast, @@ -78,6 +77,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, 
Iterator, Sequence, ) @@ -1986,7 +1986,10 @@ def _reduce( ) result = meth(skipna=skipna, **kwargs) if keepdims: - result = np.array([result]) + if name in ["min", "max"]: + result = self._from_sequence([result], dtype=self.dtype) + else: + result = np.array([result]) return result diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index a326925545045..74c0cd7719c13 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -329,15 +329,21 @@ def _from_sequence_of_strings( copy: bool = False, true_values: list[str] | None = None, false_values: list[str] | None = None, + none_values: list[str] | None = None, ) -> BooleanArray: true_values_union = cls._TRUE_VALUES.union(true_values or []) false_values_union = cls._FALSE_VALUES.union(false_values or []) - def map_string(s) -> bool: + if none_values is None: + none_values = [] + + def map_string(s) -> bool | None: if s in true_values_union: return True elif s in false_values_union: return False + elif s in none_values: + return None else: raise ValueError(f"{s} cannot be cast to bool") diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 64e5eec43a5c1..18b52f741370f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,12 +6,10 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, - Callable, Literal, cast, overload, ) -import warnings import numpy as np @@ -24,7 +22,6 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas.compat.numpy import function as nv -from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -94,6 +91,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, Sequence, @@ -2673,62 +2671,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) - @overload - def _replace(self, *, to_replace, value, inplace: Literal[False] = ...) -> Self: ... - - @overload - def _replace(self, *, to_replace, value, inplace: Literal[True]) -> None: ... 
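The `Categorical._replace` block removed below enforced a deprecation: `Series.replace` no longer rewrites the categories of a categorical. A sketch of the idiom the removed warning pointed to instead:

```python
import pandas as pd

ser = pd.Series(["a", "b", "a"], dtype="category")

# changing category labels is now spelled explicitly
print(ser.cat.rename_categories({"a": "x"}))
```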
- - def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None: - from pandas import Index - - orig_dtype = self.dtype - - inplace = validate_bool_kwarg(inplace, "inplace") - cat = self if inplace else self.copy() - - mask = isna(np.asarray(value)) - if mask.any(): - removals = np.asarray(to_replace)[mask] - removals = cat.categories[cat.categories.isin(removals)] - new_cat = cat.remove_categories(removals) - NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype) - - ser = cat.categories.to_series() - ser = ser.replace(to_replace=to_replace, value=value) - - all_values = Index(ser) - - # GH51016: maintain order of existing categories - idxr = cat.categories.get_indexer_for(all_values) - locs = np.arange(len(ser)) - locs = np.where(idxr == -1, locs, idxr) - locs = locs.argsort() - - new_categories = ser.take(locs) - new_categories = new_categories.drop_duplicates(keep="first") - index_categories = Index(new_categories) - new_codes = recode_for_categories( - cat._codes, all_values, index_categories, copy=False - ) - new_dtype = CategoricalDtype(index_categories, ordered=self.dtype.ordered) - NDArrayBacked.__init__(cat, new_codes, new_dtype) - - if new_dtype != orig_dtype: - warnings.warn( - # GH#55147 - "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is deprecated. In a future version, replace " - "will only be used for cases that preserve the categories. " - "To change the categories, use ser.cat.rename_categories " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if not inplace: - return cat - return None - # ------------------------------------------------------------------------ # String methods interface def _str_map( diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 925858a20ce41..ad0bde3abbdd4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, Union, cast, @@ -66,6 +65,7 @@ ScalarIndexer, Self, SequenceIndexer, + TakeIndexer, TimeAmbiguous, TimeNonexistent, npt, @@ -148,6 +148,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterator, Sequence, ) @@ -1849,11 +1850,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _floor_example = """>>> rng.floor('h') @@ -1876,11 +1877,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _ceil_example = """>>> rng.ceil('h') @@ -1903,11 +1904,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) 
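The wave of doctest updates from `datetime64[ns, ...]` to `datetime64[s, ...]` in this region reflects resolution inference: parsed values keep the coarsest unit that represents them instead of always widening to nanoseconds. A sketch, assuming a build with this behavior:

```python
import pandas as pd

# second-resolution strings infer a "s" unit ...
ser = pd.to_datetime(pd.Series(["2021-10-31 01:00", "2021-10-31 02:00"]))
print(ser.dtype)  # datetime64[s]

# ... while an explicit dtype still pins nanoseconds, as several doctests
# in this patch now do with dtype="M8[ns]"
ser_ns = pd.Series(["2021-10-31 01:00"], dtype="M8[ns]")
print(ser_ns.dtype)  # datetime64[ns]
```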
>>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ @@ -2340,6 +2341,27 @@ def interpolate( return self return type(self)._simple_new(out_data, dtype=self.dtype) + def take( + self, + indices: TakeIndexer, + *, + allow_fill: bool = False, + fill_value: Any = None, + axis: AxisInt = 0, + ) -> Self: + result = super().take( + indices=indices, allow_fill=allow_fill, fill_value=fill_value, axis=axis + ) + + indices = np.asarray(indices, dtype=np.intp) + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) + + if isinstance(maybe_slice, slice): + freq = self._get_getitem_freq(maybe_slice) + result._freq = freq + + return result + # -------------------------------------------------------------- # Unsorted diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b075e3d299ed0..dddfc440109d3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -143,7 +143,7 @@ def f(self): month_kw = 12 if freq: kwds = freq.kwds - month_kw = kwds.get("startingMonth", kwds.get("month", 12)) + month_kw = kwds.get("startingMonth", kwds.get("month", month_kw)) if freq is not None: freq_name = freq.name @@ -218,7 +218,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ... ) ['2023-01-01 00:00:00', '2023-01-02 00:00:00'] - Length: 2, dtype: datetime64[ns] + Length: 2, dtype: datetime64[s] """ _typ = "datetimearray" @@ -594,7 +594,7 @@ def tz(self) -> tzinfo | None: Returns ------- - datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None + zoneinfo.ZoneInfo,, datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None Returns None when the array is tz-naive. See Also @@ -613,7 +613,7 @@ def tz(self) -> tzinfo | None: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.tz datetime.timezone.utc @@ -624,7 +624,7 @@ def tz(self) -> tzinfo | None: ... ) >>> idx.tz datetime.timezone.utc - """ + """ # noqa: E501 # GH 18595 return getattr(self.dtype, "tz", None) @@ -863,7 +863,7 @@ def tz_convert(self, tz) -> Self: Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None Time zone for time. Corresponding timestamps would be converted to this time zone of the Datetime Array/Index. A `tz` of None will convert to UTC and remove the timezone information. @@ -923,7 +923,7 @@ def tz_convert(self, tz) -> Self: '2014-08-01 08:00:00', '2014-08-01 09:00:00'], dtype='datetime64[ns]', freq='h') - """ + """ # noqa: E501 tz = timezones.maybe_get_tz(tz) if self.tz is None: @@ -955,7 +955,7 @@ def tz_localize( Parameters ---------- - tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None + tz : str, zoneinfo.ZoneInfo,, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None Time zone to convert timestamps to. Passing ``None`` will remove the time zone information preserving local time. ambiguous : 'infer', 'NaT', bool array, default 'raise' @@ -1047,7 +1047,7 @@ def tz_localize( 4 2018-10-28 02:30:00+01:00 5 2018-10-28 03:00:00+01:00 6 2018-10-28 03:30:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] In some cases, inferring the DST is impossible. 
In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly @@ -1059,14 +1059,14 @@ def tz_localize( 0 2018-10-28 01:20:00+02:00 1 2018-10-28 02:36:00+02:00 2 2018-10-28 03:46:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) + ... '2015-03-29 03:30:00'], dtype="M8[ns]")) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 0 2015-03-29 03:00:00+02:00 1 2015-03-29 03:30:00+02:00 @@ -1081,7 +1081,7 @@ def tz_localize( 0 2015-03-29 03:30:00+02:00 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] - """ + """ # noqa: E501 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( nonexistent, timedelta @@ -1427,7 +1427,7 @@ def time(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.time 0 10:00:00 1 11:00:00 @@ -1470,7 +1470,7 @@ def timetz(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.timetz 0 10:00:00+00:00 1 11:00:00+00:00 @@ -1512,7 +1512,7 @@ def date(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.date 0 2020-01-01 1 2020-02-01 @@ -1861,7 +1861,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.dayofyear 0 1 1 32 @@ -1897,7 +1897,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-04-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.quarter 0 1 1 2 @@ -1933,7 +1933,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.daysinmonth 0 31 1 29 @@ -2119,6 +2119,32 @@ def isocalendar(self) -> DataFrame: >>> idx.is_year_start array([False, False, True]) + + This method, when applied to Series with datetime values under + the ``.dt`` accessor, will lose information about Business offsets. 
+ + >>> dates = pd.Series(pd.date_range("2020-10-30", periods=4, freq="BYS")) + >>> dates + 0 2021-01-01 + 1 2022-01-03 + 2 2023-01-02 + 3 2024-01-01 + dtype: datetime64[ns] + + >>> dates.dt.is_year_start + 0 True + 1 False + 2 False + 3 True + dtype: bool + + >>> idx = pd.date_range("2020-10-30", periods=4, freq="BYS") + >>> idx + DatetimeIndex(['2021-01-01', '2022-01-03', '2023-01-02', '2024-01-01'], + dtype='datetime64[ns]', freq='BYS-JAN') + + >>> idx.is_year_start + array([ True, True, True, True]) """, ) is_year_end = _field_accessor( @@ -2249,6 +2275,19 @@ def to_julian_date(self) -> npt.NDArray[np.float64]: # ----------------------------------------------------------------- # Reductions + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + if keepdims and isinstance(result, np.ndarray): + if name == "std": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(result) + else: + return self._from_sequence(result, dtype=self.dtype) + return result + def std( self, axis=None, @@ -2372,9 +2411,9 @@ def _sequence_to_dt64( data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - if out_unit is None: - out_unit = "ns" - out_dtype = np.dtype(f"M8[{out_unit}]") + out_dtype = DT64NS_DTYPE + if out_unit is not None: + out_dtype = np.dtype(f"M8[{out_unit}]") if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, @@ -2400,7 +2439,7 @@ def _sequence_to_dt64( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, - out_unit=out_unit or "ns", + out_unit=out_unit, ) copy = False if tz and inferred_tz: @@ -2508,7 +2547,7 @@ def objects_to_datetime64( utc: bool = False, errors: DateTimeErrorChoices = "raise", allow_object: bool = False, - out_unit: str = "ns", + out_unit: str | None = None, ) -> tuple[np.ndarray, tzinfo | None]: """ Convert data to array of timestamps. @@ -2524,7 +2563,8 @@ def objects_to_datetime64( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. - out_unit : str, default "ns" + out_unit : str or None, default None + None indicates we should do resolution inference. 
     Returns
     -------
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 2e1ea7236e5c4..52d64162358c8 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -8,7 +8,6 @@
 import textwrap
 from typing import (
     TYPE_CHECKING,
-    Callable,
     Literal,
     Union,
     overload,
@@ -99,6 +98,7 @@

 if TYPE_CHECKING:
     from collections.abc import (
+        Callable,
         Iterator,
         Sequence,
     )
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 04cffcaaa5f04..92ed690e527c7 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -3,7 +3,6 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     Literal,
     cast,
     overload,
@@ -73,6 +72,7 @@
 from pandas.core.util.hashing import hash_array

 if TYPE_CHECKING:
+    from collections.abc import Callable
     from collections.abc import (
         Iterator,
         Sequence,
@@ -1198,7 +1198,7 @@ def _wrap_na_result(self, *, name, axis, mask_size):
         mask = np.ones(mask_size, dtype=bool)

         float_dtyp = "float32" if self.dtype == "Float32" else "float64"
-        if name in ["mean", "median", "var", "std", "skew", "kurt"]:
+        if name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]:
             np_dtype = float_dtyp
         elif name in ["min", "max"] or self.dtype.itemsize == 8:
             np_dtype = self.dtype.numpy_dtype.name
diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py
index c5e9ed8698ffe..2c0236273e731 100644
--- a/pandas/core/arrays/numeric.py
+++ b/pandas/core/arrays/numeric.py
@@ -4,7 +4,6 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
 )

 import numpy as np
@@ -28,7 +27,10 @@
 )

 if TYPE_CHECKING:
-    from collections.abc import Mapping
+    from collections.abc import (
+        Callable,
+        Mapping,
+    )

     import pyarrow
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index 8baf363b909fb..b3513dd083e41 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -5,7 +5,6 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     Literal,
     TypeVar,
     cast,
@@ -75,7 +74,10 @@
 import pandas.core.common as com

 if TYPE_CHECKING:
-    from collections.abc import Sequence
+    from collections.abc import (
+        Callable,
+        Sequence,
+    )

     from pandas._typing import (
         AnyArrayLike,
@@ -954,6 +956,17 @@ def _check_timedeltalike_freq_compat(self, other):
             delta = delta.view("i8")
             return lib.item_from_zerodim(delta)

+    # ------------------------------------------------------------------
+    # Reductions
+
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+        if keepdims and isinstance(result, np.ndarray):
+            return self._from_sequence(result, dtype=self.dtype)
+        return result
+

 def raise_on_incompatible(left, right) -> IncompatibleFrequency:
     """
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
index 6a1c25711acb0..b8245349a4e62 100644
--- a/pandas/core/arrays/sparse/accessor.py
+++ b/pandas/core/arrays/sparse/accessor.py
@@ -98,8 +98,8 @@ def from_coo(cls, A, dense_index: bool = False) -> Series:
         ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
         ... )
         >>> A
-        <3x4 sparse matrix of type '<class 'numpy.float64'>'
-        with 3 stored elements in COOrdinate format>
+        <COOrdinate sparse matrix of dtype 'float64'
+            with 3 stored elements and shape (3, 4)>

         >>> A.todense()
         matrix([[0., 0., 1., 2.],
@@ -186,8 +186,8 @@ def to_coo(
         ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
         ... )
         >>> A
-        <3x4 sparse matrix of type '<class 'numpy.float64'>'
-        with 3 stored elements in COOrdinate format>
+        <COOrdinate sparse matrix of dtype 'float64'
+            with 3 stored elements and shape (3, 4)>

         >>> A.todense()
         matrix([[0., 0., 1., 3.],
                 [3., 0., 0., 0.],
@@ -291,12 +291,12 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
         Examples
         --------
         >>> import scipy.sparse
-        >>> mat = scipy.sparse.eye(3, dtype=float)
+        >>> mat = scipy.sparse.eye(3, dtype=int)
         >>> pd.DataFrame.sparse.from_spmatrix(mat)
              0    1    2
-        0  1.0    0    0
-        1    0  1.0    0
-        2    0    0  1.0
+        0    1    0    0
+        1    0    1    0
+        2    0    0    1
         """
         from pandas._libs.sparse import IntIndex

@@ -313,7 +313,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
         indices = data.indices
         indptr = data.indptr
         array_data = data.data
-        dtype = SparseDtype(array_data.dtype, 0)
+        dtype = SparseDtype(array_data.dtype)

         arrays = []
         for i in range(n_columns):
             sl = slice(indptr[i], indptr[i + 1])
@@ -380,8 +380,8 @@ def to_coo(self) -> spmatrix:
         --------
         >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
         >>> df.sparse.to_coo()
-        <4x1 sparse matrix of type '<class 'numpy.int64'>'
-        with 2 stored elements in COOrdinate format>
+        <COOrdinate sparse matrix of dtype 'int64'
+            with 2 stored elements and shape (4, 1)>
         """
         import_optional_dependency("scipy")
         from scipy.sparse import coo_matrix
@@ -393,8 +393,6 @@ def to_coo(self) -> spmatrix:
         cols, rows, data = [], [], []
         for col, (_, ser) in enumerate(self._parent.items()):
             sp_arr = ser.array
-            if sp_arr.fill_value != 0:
-                raise ValueError("fill value must be 0 when converting to COO matrix")

             row = sp_arr.sp_index.indices
             cols.append(np.repeat(col, len(row)))
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index adf8f44377e62..3a08344369822 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -10,7 +10,6 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     Literal,
     cast,
     overload,
@@ -87,7 +86,10 @@

 # See https://github.com/python/typing/issues/684
 if TYPE_CHECKING:
-    from collections.abc import Sequence
+    from collections.abc import (
+        Callable,
+        Sequence,
+    )
     from enum import Enum

     class ellipsis(Enum):
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 291cc2e62be62..3c0cc3a8a9c70 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -9,7 +9,10 @@

 import numpy as np

-from pandas._config import get_option
+from pandas._config import (
+    get_option,
+    using_string_dtype,
+)

 from pandas._libs import (
     lib,
@@ -81,8 +84,10 @@ class StringDtype(StorageExtensionDtype):

     Parameters
     ----------
-    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
+    storage : {"python", "pyarrow"}, optional
         If not given, the value of ``pd.options.mode.string_storage``.
+    na_value : {np.nan, pd.NA}, default pd.NA
+        Whether the dtype follows NaN or NA missing value semantics.

     Attributes
     ----------
@@ -113,30 +118,69 @@ class StringDtype(StorageExtensionDtype):
     # follows NumPy semantics, which uses nan.
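Tying together the sparse-accessor hunks above: `from_spmatrix` no longer hard-codes a fill value of `0` (letting `SparseDtype` infer it from the dtype, so integer input stays integer), and `to_coo` no longer rejects nonzero fill values. A sketch, assuming scipy is installed:

```python
import pandas as pd
import scipy.sparse

mat = scipy.sparse.eye(3, dtype=int)
df = pd.DataFrame.sparse.from_spmatrix(mat)
print(df.dtypes)           # Sparse[int64, 0] rather than a floatified dtype
print(df.sparse.to_coo())  # round-trips back to a scipy COO matrix
```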
@property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": - return np.nan - else: - return libmissing.NA + return self._na_value - _metadata = ("storage",) + _metadata = ("storage", "_na_value") # type: ignore[assignment] - def __init__(self, storage=None) -> None: + def __init__( + self, + storage: str | None = None, + na_value: libmissing.NAType | float = libmissing.NA, + ) -> None: + # infer defaults if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + if using_string_dtype() and na_value is not libmissing.NA: + storage = "pyarrow" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + + if storage == "pyarrow_numpy": + # TODO raise a deprecation warning + storage = "pyarrow" + na_value = np.nan + + # validate options + if storage not in {"python", "pyarrow"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." ) + + if isinstance(na_value, float) and np.isnan(na_value): + # when passed a NaN value, always set to np.nan to ensure we use + # a consistent NaN value (and we can use `dtype.na_value is np.nan`) + na_value = np.nan + elif na_value is not libmissing.NA: + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") + self.storage = storage + self._na_value = na_value + + def __eq__(self, other: object) -> bool: + # we need to override the base class __eq__ because na_value (NA or NaN) + # cannot be checked with normal `==` + if isinstance(other, str): + if other == self.name: + return True + try: + other = self.construct_from_string(other) + except (TypeError, ImportError): + # TypeError if `other` is not a valid string for StringDtype + # ImportError if pyarrow is not installed for "string[pyarrow]" + return False + if isinstance(other, type(self)): + return self.storage == other.storage and self.na_value is other.na_value + return False + + def __hash__(self) -> int: + # need to override __hash__ as well because of overriding __eq__ + return super().__hash__() + + def __reduce__(self): + return StringDtype, (self.storage, self.na_value) @property def type(self) -> type[str]: @@ -181,6 +225,7 @@ def construct_from_string(cls, string) -> Self: elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": + # TODO deprecate return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -205,7 +250,7 @@ def construct_array_type( # type: ignore[override] if self.storage == "python": return StringArray - elif self.storage == "pyarrow": + elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray else: return ArrowStringArrayNumpySemantics @@ -217,13 +262,17 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray.
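Because comparing ``pd.NA`` with ``==`` propagates NA rather than returning a bool, the ``__eq__`` override above checks ``na_value`` by identity, and ``__reduce__`` carries the extra state through pickling. A sketch of the resulting behaviour (illustrative, assuming pyarrow is installed):

>>> import pickle
>>> import numpy as np
>>> import pandas as pd
>>> pd.StringDtype("pyarrow") == pd.StringDtype("pyarrow", na_value=np.nan)
False
>>> dtype = pd.StringDtype("pyarrow", na_value=np.nan)
>>> pickle.loads(pickle.dumps(dtype)) == dtype
True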
""" if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + if self._na_value is libmissing.NA: + from pandas.core.arrays.string_arrow import ArrowStringArray - return ArrowStringArray(array) - elif self.storage == "pyarrow_numpy": - from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + return ArrowStringArray(array) + else: + from pandas.core.arrays.string_arrow import ( + ArrowStringArrayNumpySemantics, + ) + + return ArrowStringArrayNumpySemantics(array) - return ArrowStringArrayNumpySemantics(array) else: import pyarrow @@ -522,10 +571,19 @@ def astype(self, dtype, copy: bool = True): return super().astype(dtype, copy) def _reduce( - self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs + self, + name: str, + *, + skipna: bool = True, + keepdims: bool = False, + axis: AxisInt | None = 0, + **kwargs, ): if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna, axis=axis) + result = getattr(self, name)(skipna=skipna, axis=axis) + if keepdims: + return self._from_sequence([result], dtype=self.dtype) + return result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f2fd9d5d6610f..869cc34d5f61d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -5,7 +5,6 @@ import re from typing import ( TYPE_CHECKING, - Callable, Union, cast, ) @@ -53,7 +52,10 @@ if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( ArrayLike, @@ -129,6 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _storage = "pyarrow" + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values) -> None: _chk_pyarrow_available() @@ -138,7 +141,7 @@ def __init__(self, values) -> None: values = pc.cast(values, pa.large_string()) super().__init__(values) - self._dtype = StringDtype(storage=self._storage) + self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -185,10 +188,7 @@ def _from_sequence( if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -595,7 +595,8 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow_numpy" + _storage = "pyarrow" + _na_value = np.nan @classmethod def _result_converter(cls, values, na=None): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 865e81d7754ef..15bfe442ca87f 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -746,7 +746,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]: -------- **Series** - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="D")) >>> s 0 0 days 1 1 days @@ -765,7 +765,7 @@ def total_seconds(self) -> npt.NDArray[np.float64]: **TimedeltaIndex** - 
>>> idx = pd.to_timedelta(np.arange(5), unit="d") + >>> idx = pd.to_timedelta(np.arange(5), unit="D") >>> idx TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) @@ -809,7 +809,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='d')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='D')) >>> ser 0 1 days 1 2 days diff --git a/pandas/core/base.py b/pandas/core/base.py index 5cdbde8c64c47..863cf978426e2 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1049,6 +1049,34 @@ def value_counts( 4.0 1 NaN 1 Name: count, dtype: int64 + + **Categorical Dtypes** + + Rows with categorical type will be counted as one group + if they have same categories and order. + In the example below, even though ``a``, ``c``, and ``d`` + all have the same data types of ``category``, + only ``c`` and ``d`` will be counted as one group + since ``a`` doesn't have the same categories. + + >>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]}) + >>> df = df.astype({"a": "category", "c": "category", "d": "category"}) + >>> df + a b c d + 0 1 2 3 3 + + >>> df.dtypes + a category + b object + c category + d category + dtype: object + + >>> df.dtypes.value_counts() + category 2 + category 1 + object 1 + Name: count, dtype: int64 """ return algorithms.value_counts_internal( self, @@ -1334,7 +1362,7 @@ def factorize( 0 2000-03-11 1 2000-03-12 2 2000-03-13 - dtype: datetime64[ns] + dtype: datetime64[s] >>> ser.searchsorted('3/14/2000') 3 diff --git a/pandas/core/common.py b/pandas/core/common.py index 96291991227d9..ec0473a20458b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -12,6 +12,7 @@ defaultdict, ) from collections.abc import ( + Callable, Collection, Generator, Hashable, @@ -24,7 +25,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, TypeVar, cast, overload, @@ -145,7 +145,7 @@ def is_bool_indexer(key: Any) -> bool: elif isinstance(key, list): # check if np.array(key).dtype would be bool if len(key) > 0: - if type(key) is not list: # noqa: E721 + if type(key) is not list: # GH#42461 cython will raise TypeError if we pass a subclass key = list(key) return lib.is_bool_list(key) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index b4e33b8ac75cb..7de4d8cdf99e1 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -8,10 +8,7 @@ partial, wraps, ) -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings import numpy as np @@ -31,7 +28,10 @@ from pandas.core.computation.common import result_type_many if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import F diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index fee08c6199eef..aad768d31483a 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -193,8 +193,11 @@ def eval( corresponding bitwise operators. :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects are supported and behave as they would with plain ol' Python evaluation. - `eval` can run arbitrary code which can make you vulnerable to code - injection if you pass user input to this function. + + .. warning:: + + ``eval`` can run arbitrary code which can make you vulnerable to code + injection and untrusted data. 
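To make the new warning concrete: the expression string is handed to a full parser, so interpolating untrusted input into it lets the caller execute arbitrary code. A hedged sketch (``user_input`` is a hypothetical variable, not part of this change):

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
>>> pd.eval("df.a + df.b")  # fine: a fixed, trusted expression
0    4
1    6
dtype: int64
>>> # unsafe: pd.eval(f"df.a + {user_input}") would hand attacker-controlled code to the parser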
Parameters ---------- diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index a8123a898b4fe..b074e768e0842 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -12,7 +12,7 @@ from keyword import iskeyword import tokenize from typing import ( - Callable, + TYPE_CHECKING, ClassVar, TypeVar, ) @@ -32,7 +32,6 @@ UNARY_OPS_SYMS, BinOp, Constant, - Div, FuncNode, Op, Term, @@ -47,6 +46,9 @@ from pandas.io.formats import printing +if TYPE_CHECKING: + from collections.abc import Callable + def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: """ @@ -371,7 +373,7 @@ class BaseExprVisitor(ast.NodeVisitor): "Add", "Sub", "Mult", - None, + "Div", "Pow", "FloorDiv", "Mod", @@ -534,9 +536,6 @@ def visit_BinOp(self, node, **kwargs): left, right = self._maybe_downcast_constants(left, right) return self._maybe_evaluate_binop(op, op_class, left, right) - def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs) - def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) operand = self.visit(node.operand) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index b7a1cb173f659..a1a5f77f8539e 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -9,7 +9,6 @@ import operator from typing import ( TYPE_CHECKING, - Callable, Literal, ) @@ -36,6 +35,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterable, Iterator, ) @@ -327,31 +327,6 @@ def _not_in(x, y): _binary_ops_dict.update(d) -def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: - """ - Cast an expression inplace. - - Parameters - ---------- - terms : Op - The expression that should cast. - acceptable_dtypes : list of acceptable numpy.dtype - Will not cast if term's dtype in this list. - dtype : str or numpy.dtype - The dtype to cast to. - """ - dt = np.dtype(dtype) - for term in terms: - if term.type in acceptable_dtypes: - continue - - try: - new_value = term.value.astype(dt) - except AttributeError: - new_value = dt.type(term.value) - term.update(new_value) - - def is_term(obj) -> bool: return isinstance(obj, Term) @@ -508,34 +483,6 @@ def _disallow_scalar_only_bool_ops(self) -> None: raise NotImplementedError("cannot evaluate scalar only bool ops") -def isnumeric(dtype) -> bool: - return issubclass(np.dtype(dtype).type, np.number) - - -class Div(BinOp): - """ - Div operator to special case casting. - - Parameters - ---------- - lhs, rhs : Term or Op - The Terms or Ops in the ``/`` expression. 
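With ``visit_Div`` dropped and ``"Div"`` restored to the regular operator table (the ``Div`` node and ``_cast_inplace`` are deleted below), ``/`` goes through the generic binary-op path and simply follows NumPy's true-division promotion. A small sketch of the unchanged user-facing result:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2, 4], "b": [2, 2, 2]})
>>> df.eval("a / b")  # integer inputs still promote to float64
0    0.5
1    1.0
2    2.0
dtype: float64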
- """ - - def __init__(self, lhs, rhs) -> None: - super().__init__("/", lhs, rhs) - - if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): - raise TypeError( - f"unsupported operand type(s) for {self.op}: " - f"'{lhs.return_type}' and '{rhs.return_type}'" - ) - - # do not upcast float32s to float64 un-necessarily - acceptable_dtypes = [np.float32, np.float64] - _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) - - UNARY_OPS_SYMS = ("+", "-", "~", "not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) _unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 46c9139c3456c..e62cda0dfe8d0 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -12,8 +12,9 @@ from __future__ import annotations +from collections.abc import Callable import os -from typing import Callable +from typing import Any import pandas._config.config as cf from pandas._config.config import ( @@ -455,12 +456,27 @@ def is_terminal() -> bool: ``future.infer_string`` is set to True. """ + +def is_valid_string_storage(value: Any) -> None: + legal_values = ["python", "pyarrow"] + if value not in legal_values: + msg = "Value must be one of python|pyarrow" + if value == "pyarrow_numpy": + # TODO: we can remove extra message after 3.0 + msg += ( + ". 'pyarrow_numpy' was specified, but this option should be " + "enabled using pandas.options.future.infer_string instead" + ) + raise ValueError(msg) + + with cf.config_prefix("mode"): cf.register_option( "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + # validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_valid_string_storage, ) @@ -858,7 +874,7 @@ def register_converter_cb(key: str) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f01d8822241c9..81aeb40f375b0 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -16,7 +16,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas._libs.tslibs import ( @@ -554,7 +554,7 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype - data_was_index = isinstance(data, ABCIndex) + infer_object = not isinstance(data, (ABCIndex, ABCSeries)) # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -571,14 +571,10 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - if ( - isinstance(data, str) - and using_pyarrow_string_dtype() - and original_dtype is None - ): + if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype("pyarrow", na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -607,12 +603,12 @@ def 
sanitize_array( if dtype is None: subarr = data - if data.dtype == object and not data_was_index: + if data.dtype == object and infer_object: subarr = maybe_infer_to_datetimelike(data) - elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 08adb580ff08f..d750451a1ca84 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import ( Interval, @@ -39,7 +39,6 @@ is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 -from pandas.compat.numpy import np_version_gt2 from pandas.errors import ( IntCastingNaNError, LossySetitemError, @@ -799,10 +798,10 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: # coming out as np.str_! dtype = _dtype_obj - if using_pyarrow_string_dtype(): + if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: @@ -1193,7 +1192,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, - dtype_if_all_nat=np.dtype("M8[ns]"), + dtype_if_all_nat=np.dtype("M8[s]"), ) @@ -1643,13 +1642,11 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n with warnings.catch_warnings(): # We already disallow dtype=uint w/ negative numbers # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. - if not np_version_gt2: - warnings.filterwarnings( - "ignore", - "NumPy will stop allowing conversion of " - "out-of-bound Python int", - DeprecationWarning, - ) + warnings.filterwarnings( + "ignore", + "NumPy will stop allowing conversion of " "out-of-bound Python int", + DeprecationWarning, + ) casted = np.asarray(arr, dtype=dtype) else: with warnings.catch_warnings(): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2ac75a0700759..3c11b9d723c14 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import warnings @@ -55,6 +54,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( ArrayLike, DtypeObj, @@ -186,6 +187,10 @@ def is_sparse(arr) -> bool: bool Whether or not the array-like is a pandas sparse array. + See Also + -------- + api.types.SparseDtype : The dtype object for pandas sparse arrays. + Examples -------- Returns `True` if the parameter is a 1-D pandas sparse array. @@ -557,6 +562,11 @@ def is_string_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of the string dtype. + See Also + -------- + api.types.is_string_dtype : Check whether the provided array or dtype + is of the string dtype. 
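Together with the ``config_init`` change above (``future.infer_string`` can now default on via the ``PANDAS_FUTURE_INFER_STRING`` environment variable), ``sanitize_array`` routes string data to the NaN-flavoured pyarrow dtype. A hedged sketch of the observable effect (assuming pyarrow is installed):

>>> import pandas as pd
>>> pd.set_option("future.infer_string", True)
>>> s = pd.Series(["a", "b"])
>>> s.dtype.storage
'pyarrow'
>>> s.dtype.na_value  # NaN rather than pd.NA under the future default
nan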
+ Examples -------- >>> from pandas.api.types import is_string_dtype @@ -714,6 +724,15 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool: Whether or not the array or dtype is of a signed integer dtype and not an instance of timedelta64. + See Also + -------- + api.types.is_integer_dtype: Check whether the provided array or dtype + is of an integer dtype. + api.types.is_numeric_dtype: Check whether the provided array or dtype + is of a numeric dtype. + api.types.is_unsigned_integer_dtype: Check whether the provided array + or dtype is of an unsigned integer dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_signed_integer_dtype @@ -770,6 +789,15 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of an unsigned integer dtype. + See Also + -------- + api.types.is_signed_integer_dtype : Check whether the provided array + or dtype is of a signed integer dtype. + api.types.is_integer_dtype : Check whether the provided array or dtype + is of an integer dtype. + api.types.is_numeric_dtype : Check whether the provided array or dtype + is of a numeric dtype. + Examples -------- >>> from pandas.api.types import is_unsigned_integer_dtype @@ -1157,6 +1185,12 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a real number dtype. + See Also + -------- + is_numeric_dtype : Check if a dtype is numeric. + is_complex_dtype : Check if a dtype is complex. + is_bool_dtype : Check if a dtype is boolean. + Examples -------- >>> from pandas.api.types import is_any_real_numeric_dtype diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 17e68b0e19a68..dcf8cb5c78536 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -190,6 +190,7 @@ def union_categoricals( Returns ------- Categorical + The union of categories being combined. Raises ------ @@ -201,6 +202,11 @@ ValueError Empty list of categoricals passed + See Also + -------- + CategoricalDtype : Type for categorical data with the categories and orderedness. + Categorical : Represent a categorical variable in classic R / S-plus fashion. + Notes ----- To learn more about categories, see `link <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 45814ca77b70f..3aeab96e03163 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -205,7 +205,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): by providing an empty index.
As follows, >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype - dtype('<M8[ns]') + dtype('<M8[s]') diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.isna(index) array([False, False, True, False]) @@ -362,7 +362,7 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.notna(index) array([ True, True, False, True]) @@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): nan >>> na_value_for_dtype(np.dtype("float64")) nan + >>> na_value_for_dtype(np.dtype("complex128")) + nan >>> na_value_for_dtype(np.dtype("bool")) False >>> na_value_for_dtype(np.dtype("datetime64[ns]")) @@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): elif dtype.kind in "mM": unit = np.datetime_data(dtype)[0] return dtype.type("NaT", unit) - elif dtype.kind == "f": + elif dtype.kind in "fc": return np.nan elif dtype.kind in "iu": if compat: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01ac5a2be3d79..ea91046f4b8e4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -14,6 +14,7 @@ import collections from collections import abc from collections.abc import ( + Callable, Hashable, Iterable, Iterator, @@ -29,7 +30,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -531,6 +531,7 @@ class DataFrame(NDFrame, OpsMixin): will perform column selection instead. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. + If ``data`` is a DataFrame, this argument is ignored. copy : bool or None, default None Copy data from inputs. For dict data, the default of None behaves like ``copy=True``. For DataFrame @@ -728,10 +729,6 @@ def __init__( NDFrame.__init__(self, data) return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - # GH47215 if isinstance(index, set): raise ValueError("index cannot be a set") @@ -896,18 +893,6 @@ def __init__( NDFrame.__init__(self, mgr) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtypes.iloc[0] != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The DataFrame " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - # ---------------------------------------------------------------------- def __dataframe__( @@ -916,6 +901,19 @@ def __dataframe__( """ Return the dataframe interchange object implementing the interchange protocol. + .. note:: + + For new development, we highly recommend using the Arrow C Data Interface + alongside the Arrow PyCapsule Interface instead of the interchange protocol. + + ..
warning:: + + Due to severe implementation issues, we recommend only considering using the + interchange protocol in the following cases: + + - converting to pandas: for pandas >= 2.0.3 + - converting from pandas: for pandas >= 3.0.0 + Parameters ---------- nan_as_null : bool, default False @@ -1062,7 +1060,7 @@ def _is_homogeneous_type(self) -> bool: False """ # The "<" part of "<=" here is for empty DataFrame cases - return len({arr.dtype for arr in self._mgr.arrays}) <= 1 + return len({block.values.dtype for block in self._mgr.blocks}) <= 1 @property def _can_fast_transpose(self) -> bool: @@ -4489,7 +4487,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No or punctuations (besides underscores) or starting with digits must be surrounded by backticks. (For example, a column named "Area (cm^2)" would be referenced as ```Area (cm^2)```). Column names which are Python keywords - (like "list", "for", "import", etc) cannot be used. + (like "if", "for", "import", etc) cannot be used. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. @@ -4570,43 +4568,51 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No For example, ```it's` > `that's``` will raise an error, as it forms a quoted string (``'s > `that'``) with a backtick inside. - See also the Python documentation about lexical analysis - (https://docs.python.org/3/reference/lexical_analysis.html) + See also the `Python documentation about lexical analysis + <https://docs.python.org/3/reference/lexical_analysis.html>`__ in combination with the source code in :mod:`pandas.core.computation.parsing`. Examples -------- >>> df = pd.DataFrame( - ... {"A": range(1, 6), "B": range(10, 0, -2), "C C": range(10, 5, -1)} + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} ... ) >>> df - A B C C + A B C&C 0 1 10 10 1 2 8 9 2 3 6 8 3 4 4 7 4 5 2 6 >>> df.query("A > B") - A B C C + A B C&C 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] - A B C C + A B C&C 4 5 2 6 For columns with spaces or other special characters in their name, you can use backtick quoting. - >>> df.query("B == `C C`") - A B C C + >>> df.query("B == `C&C`") + A B C&C 0 1 10 10 The previous expression is equivalent to - >>> df[df.B == df["C C"]] - A B C C + >>> df[df.B == df["C&C"]] + A B C&C 0 1 10 10 + + Using a local variable: + + >>> local_var = 2 + >>> df.query("A <= @local_var") + A B C&C + 0 1 10 10 + 1 2 8 9 """ inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): @@ -4647,6 +4653,13 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: ---------- expr : str The expression string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable + names by surrounding them with backticks `````. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4678,14 +4691,16 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Examples -------- - >>> df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)}) + >>> df = pd.DataFrame( + ... {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)} + ...
) >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 >>> df.eval("A + B") 0 11 1 10 @@ -4697,35 +4712,55 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: Assignment is allowed though by default the original DataFrame is not modified. - >>> df.eval("C = A + B") - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 + >>> df.eval("D = A + B") + A B C&C D + 0 1 10 10 11 + 1 2 8 9 10 + 2 3 6 8 9 + 3 4 4 7 8 + 4 5 2 6 7 >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C&C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 Multiple columns can be assigned to using multi-line expressions: >>> df.eval( ... ''' - ... C = A + B - ... D = A - B + ... D = A + B + ... E = A - B ... ''' ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 + A B C&C D E + 0 1 10 10 11 -9 + 1 2 8 9 10 -6 + 2 3 6 8 9 -3 + 3 4 4 7 8 0 + 4 5 2 6 7 3 + + For columns with spaces or other special characters in their name, you can use backtick quoting. + + >>> df.eval("B * `C&C`") + 0 100 + 1 72 + 2 48 + 3 28 + 4 12 + dtype: int64 + + Local variables must be explicitly referenced by prefixing them with the ``@`` character: + + >>> local_var = 2 + >>> df.eval("@local_var * A") + 0 2 + 1 4 + 2 6 + 3 8 + 4 10 + dtype: int64 """ from pandas.core.computation.eval import eval as _eval @@ -4761,6 +4796,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ValueError * If both of ``include`` and ``exclude`` are empty * If ``include`` and ``exclude`` have overlapping elements + TypeError * If any kind of string dtype is passed in. See Also @@ -5705,7 +5741,6 @@ def shift( periods = cast(int, periods) ncols = len(self.columns) - arrays = self._mgr.arrays if axis == 1 and periods != 0 and ncols > 0 and freq is None: if fill_value is lib.no_default: # We will infer fill_value to match the closest column @@ -5731,12 +5766,12 @@ def shift( result.columns = self.columns.copy() return result - elif len(arrays) > 1 or ( + elif len(self._mgr.blocks) > 1 or ( # If we only have one block and we know that we can't # keep the same dtype (i.e. the _can_hold_element check) # then we can go through the reindex_indexer path # (and avoid casting logic in the Block method). - not can_hold_element(arrays[0], fill_value) + not can_hold_element(self._mgr.blocks[0].values, fill_value) ): # GH#35488 we need to watch out for multi-block cases # We only get here with fill_value not-lib.no_default @@ -9255,6 +9290,11 @@ def pivot( .. versionadded:: 1.3.0 + **kwargs : dict + Optional keyword arguments to pass to ``aggfunc``. + + ..
versionadded:: 3.0.0 + Returns ------- DataFrame @@ -9362,6 +9402,7 @@ def pivot_table( margins_name: Level = "All", observed: bool = True, sort: bool = True, + **kwargs, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table @@ -9377,6 +9418,7 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, + **kwargs, ) def stack( @@ -11432,7 +11474,7 @@ def _get_data() -> DataFrame: if numeric_only: df = _get_data() if axis is None: - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + dtype = find_common_type([block.values.dtype for block in df._mgr.blocks]) if isinstance(dtype, ExtensionDtype): df = df.astype(dtype) arr = concat_compat(list(df._iter_column_arrays())) @@ -11457,7 +11499,9 @@ def _get_data() -> DataFrame: # kurtosis excluded since groupby does not implement it if df.shape[1] and name != "kurt": - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + dtype = find_common_type( + [block.values.dtype for block in df._mgr.blocks] + ) if isinstance(dtype, ExtensionDtype): # GH 54341: fastpath for EA-backed axis=1 reductions # This flattens the frame into a single 1D array while keeping @@ -11531,8 +11575,8 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: else: raise NotImplementedError(name) - for arr in self._mgr.arrays: - middle = func(arr, axis=0, skipna=skipna) + for blocks in self._mgr.blocks: + middle = func(blocks.values, axis=0, skipna=skipna) result = ufunc(result, middle) res_ser = self._constructor_sliced(result, index=self.index, copy=False) @@ -11615,7 +11659,7 @@ def all( **kwargs, ) -> Series | bool: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="all") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") @doc(make_doc("all", ndim=1)) def all( self, @@ -11662,7 +11706,7 @@ def min( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") @doc(make_doc("min", ndim=2)) def min( self, @@ -11709,7 +11753,7 @@ def max( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") @doc(make_doc("max", ndim=2)) def max( self, @@ -11725,7 +11769,7 @@ def max( result = result.__finalize__(self, method="max") return result - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") def sum( self, axis: Axis | None = 0, @@ -11826,7 +11870,7 @@ def sum( result = result.__finalize__(self, method="sum") return result - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod") def prod( self, axis: Axis | None = 0, @@ -11944,7 +11988,7 @@ def mean( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean") @doc(make_doc("mean", ndim=2)) def mean( self, @@ -11991,7 +12035,7 @@ def median( **kwargs, ) -> Series | Any: ... 
- @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median") @doc(make_doc("median", ndim=2)) def median( self, @@ -12041,7 +12085,7 @@ def sem( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem") def sem( self, axis: Axis | None = 0, @@ -12161,7 +12205,7 @@ def var( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var") def var( self, axis: Axis | None = 0, @@ -12280,7 +12324,7 @@ def std( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std") def std( self, axis: Axis | None = 0, @@ -12403,7 +12447,7 @@ def skew( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew") def skew( self, axis: Axis | None = 0, @@ -12523,7 +12567,7 @@ def kurt( **kwargs, ) -> Series | Any: ... - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt") def kurt( self, axis: Axis | None = 0, @@ -13049,7 +13093,7 @@ def quantile( if len(data.columns) == 0: # GH#23925 _get_numeric_data may have dropped all columns - cols = Index([], name=self.columns.name) + cols = self.columns[:0] dtype = np.float64 if axis == 1: @@ -13249,7 +13293,7 @@ def to_period( >>> idx DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.to_period("M") PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') @@ -13298,6 +13342,11 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: Series.str.contains: Test if pattern or regex is contained within a string of a Series or Index. + Notes + ----- + ``__iter__`` is used (and not ``__contains__``) to iterate over values + when checking if it contains the elements in DataFrame. + Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ca60ca9b48a14..8a6fc69d47cc3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, NoReturn, @@ -158,7 +157,6 @@ Index, MultiIndex, PeriodIndex, - RangeIndex, default_index, ensure_index, ) @@ -186,6 +184,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Hashable, Iterator, @@ -1492,14 +1491,12 @@ def __invert__(self) -> Self: return res.__finalize__(self, method="__invert__") @final - def __nonzero__(self) -> NoReturn: + def __bool__(self) -> NoReturn: raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." 
) - __bool__ = __nonzero__ - @final def abs(self) -> Self: """ @@ -1852,7 +1849,7 @@ def _drop_labels_or_levels(self, keys, axis: AxisInt = 0): else: # Drop the last level of Index by replacing with # a RangeIndex - dropped.columns = RangeIndex(dropped.columns.size) + dropped.columns = default_index(dropped.columns.size) # Handle dropping index labels if labels_to_drop: @@ -2388,7 +2385,8 @@ def to_json( index : bool or None, default None The index is only used when 'orient' is 'split', 'index', 'column', or 'table'. Of these, 'index' and 'column' do not support - `index=False`. + `index=False`. The string 'index' as a column name with empty :class:`Index` + or if it is 'index' will raise a ``ValueError``. indent : int, optional Length of whitespace used to indent each record. @@ -2780,7 +2778,8 @@ def to_sql( ---------- name : str Name of SQL table. - con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + con : ADBC connection, sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. Legacy support is provided for sqlite3.Connection objects. The user is responsible for engine disposal and connection closure for the SQLAlchemy @@ -2967,6 +2966,22 @@ def to_sql( >>> with engine.connect() as conn: ... conn.execute(text("SELECT * FROM integers")).fetchall() [(1,), (None,), (2,)] + + .. versionadded:: 2.2.0 + + pandas now supports writing via ADBC drivers + + >>> df = pd.DataFrame({'name' : ['User 10', 'User 11', 'User 12']}) + >>> df + name + 0 User 10 + 1 User 11 + 2 User 12 + + >>> from adbc_driver_sqlite import dbapi # doctest:+SKIP + >>> with dbapi.connect("sqlite://") as conn: # doctest:+SKIP + ... df.to_sql(name="users", con=conn) + 3 """ # noqa: E501 from pandas.io import sql @@ -3209,7 +3224,7 @@ class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' Dimensions: (date: 2, animal: 2) Coordinates: - * date (date) datetime64[ns] 2018-01-01 2018-01-02 + * date (date) datetime64[s] 2018-01-01 2018-01-02 * animal (animal) object 'falcon' 'parrot' Data variables: speed (date, animal) int64 350 18 361 15 @@ -5732,7 +5747,7 @@ def sample( replace : bool, default False Allow or disallow sampling of the same row more than once. weights : str or ndarray-like, optional - Default 'None' results in equal probability weighting. + Default ``None`` results in equal probability weighting. If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and index values in sampled object not in weights will be assigned @@ -5747,6 +5762,7 @@ def sample( random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. + Default ``None`` results in sampling with the current state of np.random. .. 
versionchanged:: 1.4.0 @@ -6017,17 +6033,16 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": + objs = other.objs # propagate attrs only if all concat arguments have the same attrs - if all(bool(obj.attrs) for obj in other.objs): + if all(bool(obj.attrs) for obj in objs): # all concatenate arguments have non-empty attrs - attrs = other.objs[0].attrs - have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + attrs = objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in objs[1:]) if have_same_attrs: self.attrs = deepcopy(attrs) - allows_duplicate_labels = all( - x.flags.allows_duplicate_labels for x in other.objs - ) + allows_duplicate_labels = all(x.flags.allows_duplicate_labels for x in objs) self.flags.allows_duplicate_labels = allows_duplicate_labels return self @@ -6194,7 +6209,7 @@ def dtypes(self): >>> df.dtypes float float64 int int64 - datetime datetime64[ns] + datetime datetime64[s] string object dtype: object """ @@ -6373,7 +6388,7 @@ def astype( # TODO(EA2D): special case not needed with 2D EAs dtype = pandas_dtype(dtype) if isinstance(dtype, ExtensionDtype) and all( - arr.dtype == dtype for arr in self._mgr.arrays + block.values.dtype == dtype for block in self._mgr.blocks ): return self.copy(deep=False) # GH 18099/22869: columnwise conversion to extension dtype @@ -9271,7 +9286,9 @@ def compare( # reorder axis to keep things organized indices = ( - np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten() + np.arange(diff.shape[axis]) + .reshape([2, diff.shape[axis] // 2]) + .T.reshape(-1) ) diff = diff.take(indices, axis=axis) @@ -10653,10 +10670,10 @@ def tz_localize( dates forward or backward with a timedelta object or `'shift_forward'` or `'shift_backward'`. - >>> s = pd.Series( - ... range(2), - ... index=pd.DatetimeIndex(["2015-03-29 02:30:00", "2015-03-29 03:30:00"]), + >>> dti = pd.DatetimeIndex( + ... ["2015-03-29 02:30:00", "2015-03-29 03:30:00"], dtype="M8[ns]" ... ) + >>> s = pd.Series(range(2), index=dti) >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_forward") 2015-03-29 03:00:00+02:00 0 2015-03-29 03:30:00+02:00 1 @@ -11148,9 +11165,9 @@ def _logical_func( if ( self.ndim > 1 and axis == 1 - and len(self._mgr.arrays) > 1 + and len(self._mgr.blocks) > 1 # TODO(EA2D): special-case not needed - and all(x.ndim == 2 for x in self._mgr.arrays) + and all(block.values.ndim == 2 for block in self._mgr.blocks) and not kwargs ): # Fastpath avoiding potentially expensive transpose @@ -11758,6 +11775,8 @@ def last_valid_index(self) -> Hashable: Returns ------- {name1} or scalar\ + + Value containing the calculation referenced in the description.\ {see_also}\ {examples} """ @@ -12460,6 +12479,14 @@ def last_valid_index(self) -> Hashable: stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0 ) +_skew_see_also = """ + +See Also +-------- +Series.skew : Return unbiased skew over requested axis. +Series.var : Return unbiased variance over requested axis. +Series.std : Return unbiased standard deviation over requested axis.""" + _stat_func_see_also = """ See Also @@ -12571,7 +12598,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "median": base_doc = _num_doc desc = "Return the median of the values over the requested axis." 
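The ``__finalize__`` hunk above only hoists ``other.objs`` into a local; the rule it preserves is that ``attrs`` survive ``concat`` only when every input has the same non-empty ``attrs``. Sketch:

>>> import pandas as pd
>>> a = pd.DataFrame({"x": [1]})
>>> b = pd.DataFrame({"x": [2]})
>>> a.attrs = {"source": "sensor"}
>>> b.attrs = {"source": "sensor"}
>>> pd.concat([a, b]).attrs  # identical non-empty attrs propagate
{'source': 'sensor'}
>>> b.attrs = {}
>>> pd.concat([a, b]).attrs  # one empty attrs disables propagation
{}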
- see_also = "" + see_also = _stat_func_see_also examples = """ Examples @@ -12612,7 +12639,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "mean": base_doc = _num_doc desc = "Return the mean of the values over the requested axis." - see_also = "" + see_also = _stat_func_see_also examples = """ Examples @@ -12719,7 +12746,7 @@ def make_doc(name: str, ndim: int) -> str: elif name == "skew": base_doc = _num_doc desc = "Return unbiased skew over requested axis.\n\nNormalized by N-1." - see_also = "" + see_also = _skew_see_also examples = """ Examples @@ -12760,6 +12787,7 @@ def make_doc(name: str, ndim: int) -> str: a 0.0 dtype: float64""" kwargs = {"min_count": ""} + elif name == "kurt": base_doc = _num_doc desc = ( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a20577e8d3df9..c112d9b6a4b54 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -9,18 +9,19 @@ from __future__ import annotations from collections import abc +from collections.abc import Callable from functools import partial from textwrap import dedent from typing import ( TYPE_CHECKING, Any, - Callable, Literal, NamedTuple, TypeVar, Union, cast, ) +import warnings import numpy as np @@ -32,6 +33,7 @@ Substitution, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int64, @@ -122,6 +124,10 @@ class NamedAgg(NamedTuple): Function to apply to the provided column. If string, the name of a built-in pandas function. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Examples -------- >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) @@ -680,7 +686,8 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: b 1 dtype: int64 """ - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups val = self.obj._values codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) @@ -1206,7 +1213,7 @@ def idxmin(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmin() a 2023-01-01 b 2023-02-01 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmin", skipna=skipna) @@ -1259,7 +1266,7 @@ def idxmax(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmax() a 2023-01-15 b 2023-02-15 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmax", skipna=skipna) @@ -2726,6 +2733,8 @@ def corrwith( """ Compute pairwise correlation. + .. deprecated:: 3.0.0 + Pairwise correlation is computed between rows or columns of DataFrame with rows or columns of Series or DataFrame. DataFrames are first aligned along both axes before computing the @@ -2785,6 +2794,11 @@ def corrwith( 2 0.755929 NaN 3 0.576557 NaN """ + warnings.warn( + "DataFrameGroupBy.corrwith is deprecated", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "corrwith", other=other, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1b58317c08736..c07bc56377151 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -10,6 +10,7 @@ class providing the base-class of operations. from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Iterator, @@ -24,7 +25,6 @@ class providing the base-class of operations. 
from textwrap import dedent from typing import ( TYPE_CHECKING, - Callable, Literal, TypeVar, Union, @@ -128,7 +128,6 @@ class providing the base-class of operations. from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, default_index, ) from pandas.core.internals.blocks import ensure_block_shape @@ -635,7 +634,7 @@ def groups(self) -> dict[Hashable, Index]: 0 1 2 3 1 1 5 6 2 7 8 9 - >>> df.groupby(by=["a"]).groups + >>> df.groupby(by="a").groups {1: [0, 1], 7: [2]} For Resampler: @@ -655,6 +654,15 @@ def groups(self) -> dict[Hashable, Index]: >>> ser.resample("MS").groups {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} """ + if isinstance(self.keys, list) and len(self.keys) == 1: + warnings.warn( + "`groups` by one element list returns scalar is deprecated " + "and will be removed. In a future version `groups` by one element " + "list will return tuple. Use ``df.groupby(by='a').groups`` " + "instead of ``df.groupby(by=['a']).groups`` to avoid this warning", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._grouper.groups @final @@ -1264,7 +1272,7 @@ def _set_result_index_ordered( if self._grouper.has_dropped_na: # Add back in any missing rows due to dropna - index here is integral # with values referring to the row of the input so can use RangeIndex - result = result.reindex(RangeIndex(len(index)), axis=0) + result = result.reindex(default_index(len(index)), axis=0) result = result.set_axis(index, axis=0) return result @@ -1334,7 +1342,7 @@ def _wrap_aggregated_output( # enforced in __init__ result = self._insert_inaxis_grouper(result, qs=qs) result = result._consolidate() - result.index = RangeIndex(len(result)) + result.index = default_index(len(result)) else: index = self._grouper.result_index @@ -1360,7 +1368,7 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, ngroups = self._grouper.group_info + ngroups = self._grouper.ngroups sorted_index = self._grouper.result_ilocs sorted_ids = self._grouper._sorted_ids @@ -1969,7 +1977,8 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2185,7 +2194,8 @@ def count(self) -> NDFrameT: Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups mask = ids != -1 is_series = data.ndim == 1 @@ -2337,6 +2347,12 @@ def median(self, numeric_only: bool = False) -> NDFrameT: Series or DataFrame Median of values within each group. + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. 
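The deprecation added to ``groups`` above fires only for one-element list keys; scalar keys keep the current (and future) behaviour. A sketch of the two spellings, reusing the frame from the docstring:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 1, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
>>> df.groupby(by="a").groups  # no warning
{1: [0, 1], 7: [2]}
>>> df.groupby(by=["a"]).groups  # FutureWarning: keys will become tuples
{1: [0, 1], 7: [2]}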
+ Examples -------- For SeriesGroupBy: @@ -3840,7 +3856,8 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups col_func = partial( libgroupby.group_fillna_indexer, @@ -4361,7 +4378,8 @@ def post_processor( qs = np.array([q], dtype=np.float64) pass_qs = None - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups if self.dropna: # splitter drops NA groups, we need to do the same ids = ids[ids >= 0] @@ -5038,7 +5056,8 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, ngroups = self._grouper.group_info + ids = self._grouper.ids + ngroups = self._grouper.ngroups res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) @@ -5385,6 +5404,7 @@ def sample( random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. + Default ``None`` results in sampling with the current state of np.random. .. versionchanged:: 1.4.0 @@ -5399,6 +5419,7 @@ def sample( See Also -------- DataFrame.sample: Generate random samples from a DataFrame object. + Series.sample: Generate random samples from a Series object. numpy.random.choice: Generate a random sample from a given 1-D numpy array. diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e75a5b9089f5f..5f9ebdcea4a2d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -34,6 +34,7 @@ from pandas.core.indexes.api import ( Index, MultiIndex, + default_index, ) from pandas.core.series import Series @@ -71,6 +72,9 @@ class Grouper: Currently unused, reserved for future use. **kwargs Dictionary of the keyword arguments to pass to Grouper. + + Attributes + ---------- key : str, defaults to None Groupby key, which selects the grouping column of the target. 
level : name/number, defaults to None @@ -901,7 +905,7 @@ def is_in_obj(gpr) -> bool: if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") if len(groupings) == 0: - groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) + groupings.append(Grouping(default_index(0), np.array([], dtype=np.intp))) # create the internals grouper grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index b22fc9248eeca..73b681c64c3a3 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -20,6 +19,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import Scalar diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4f40c4f4283f0..da80969b613cd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,7 +12,6 @@ import functools from typing import ( TYPE_CHECKING, - Callable, Generic, final, ) @@ -70,10 +69,10 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Hashable, Iterator, - Sequence, ) from pandas.core.generic import NDFrame @@ -581,14 +580,14 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: Sequence[grouper.Grouping], + groupings: list[grouper.Grouping], sort: bool = True, dropna: bool = True, ) -> None: assert isinstance(axis, Index), axis self.axis = axis - self._groupings: list[grouper.Grouping] = list(groupings) + self._groupings = groupings self._sort = sort self.dropna = dropna @@ -596,10 +595,6 @@ def __init__( def groupings(self) -> list[grouper.Grouping]: return self._groupings - @property - def shape(self) -> Shape: - return tuple(ping.ngroups for ping in self.groupings) - def __iter__(self) -> Iterator[Hashable]: return iter(self.indices) @@ -628,11 +623,15 @@ def _get_splitter(self, data: NDFrame) -> DataSplitter: ------- Generator yielding subsetted objects """ - ids, ngroups = self.group_info - return _get_splitter( + if isinstance(data, Series): + klass: type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame + klass = FrameSplitter + + return klass( data, - ids, - ngroups, + self.ngroups, sorted_ids=self._sorted_ids, sort_idx=self.result_ilocs, ) @@ -692,7 +691,8 @@ def size(self) -> Series: """ Compute group sizes. 
""" - ids, ngroups = self.group_info + ids = self.ids + ngroups = self.ngroups out: np.ndarray | list if ngroups: out = np.bincount(ids[ids != -1], minlength=ngroups) @@ -729,12 +729,6 @@ def has_dropped_na(self) -> bool: """ return bool((self.ids < 0).any()) - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - result_index, ids = self.result_index_and_ids - ngroups = len(result_index) - return ids, ngroups - @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis @@ -1123,10 +1117,6 @@ def indices(self): i = bin return indices - @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - return self.ids, self.ngroups - @cache_readonly def codes(self) -> list[npt.NDArray[np.intp]]: return [self.ids] @@ -1191,29 +1181,25 @@ class DataSplitter(Generic[NDFrameT]): def __init__( self, data: NDFrameT, - labels: npt.NDArray[np.intp], ngroups: int, *, sort_idx: npt.NDArray[np.intp], sorted_ids: npt.NDArray[np.intp], ) -> None: self.data = data - self.labels = ensure_platform_int(labels) # _should_ already be np.intp self.ngroups = ngroups self._slabels = sorted_ids self._sort_idx = sort_idx def __iter__(self) -> Iterator: - sdata = self._sorted_data - if self.ngroups == 0: # we are inside a generator, rather than raise StopIteration # we merely return signal the end return starts, ends = lib.generate_slices(self._slabels, self.ngroups) - + sdata = self._sorted_data for start, end in zip(starts, ends): yield self._chop(sdata, slice(start, end)) @@ -1241,20 +1227,3 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: mgr = sdata._mgr.get_slice(slice_obj, axis=1) df = sdata._constructor_from_mgr(mgr, axes=mgr.axes) return df.__finalize__(sdata, method="groupby") - - -def _get_splitter( - data: NDFrame, - labels: npt.NDArray[np.intp], - ngroups: int, - *, - sort_idx: npt.NDArray[np.intp], - sorted_ids: npt.NDArray[np.intp], -) -> DataSplitter: - if isinstance(data, Series): - klass: type[DataSplitter] = SeriesSplitter - else: - # i.e. DataFrame - klass = FrameSplitter - - return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 083e86500a210..0064aa91056e8 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -167,6 +167,31 @@ class VariableOffsetWindowIndexer(BaseIndexer): """ Calculate window boundaries based on a non-fixed offset such as a BusinessDay. + Parameters + ---------- + index_array : np.ndarray, default 0 + Array-like structure specifying the indices for data points. + This parameter is currently not used. + + window_size : int, optional, default 0 + Specifies the number of data points in each window. + This parameter is currently not used. + + index : DatetimeIndex, optional + ``DatetimeIndex`` of the labels of each observation. + + offset : BaseOffset, optional + ``DateOffset`` representing the size of the window. + + **kwargs + Additional keyword arguments passed to the parent class ``BaseIndexer``. + + See Also + -------- + api.indexers.BaseIndexer : Base class for all indexers. + DataFrame.rolling : Rolling window calculations on DataFrames. + offsets : Module providing various time offset classes. 
+ Examples -------- >>> from pandas.api.indexers import VariableOffsetWindowIndexer diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 3cb51f7447677..e2dc71f68a65b 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -459,7 +459,7 @@ def to_pytimedelta(self) -> np.ndarray: Examples -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="D")) >>> s 0 0 days 1 1 days diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 83e8df5072b92..5144e647e73b4 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -130,7 +130,7 @@ def _get_combined_index( # TODO: handle index names! indexes = _get_distinct_objs(indexes) if len(indexes) == 0: - index = Index([]) + index: Index = default_index(0) elif len(indexes) == 1: index = indexes[0] elif intersect: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a3fb8bc851df..50f44cc728aea 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -8,7 +8,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ClassVar, Literal, NoReturn, @@ -46,7 +45,9 @@ ArrayLike, Axes, Axis, + AxisInt, DropKeep, + Dtype, DtypeObj, F, IgnoreRaise, @@ -58,6 +59,7 @@ ReindexMethod, Self, Shape, + SliceType, npt, ) from pandas.compat.numpy import function as nv @@ -193,6 +195,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Sequence, @@ -490,8 +493,6 @@ def __new__( if not copy and isinstance(data, (ABCSeries, Index)): refs = data._references - is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) - # range if isinstance(data, (range, RangeIndex)): result = RangeIndex(start=data, copy=copy, name=name) @@ -508,7 +509,7 @@ def __new__( elif is_ea_or_datetimelike_dtype(data_dtype): pass - elif isinstance(data, (np.ndarray, Index, ABCSeries)): + elif isinstance(data, (np.ndarray, ABCMultiIndex)): if isinstance(data, ABCMultiIndex): data = data._values @@ -518,7 +519,9 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - + elif isinstance(data, (ABCSeries, Index)): + # GH 56244: Avoid potential inference on object types + pass elif is_scalar(data): raise cls._raise_scalar_data_error(data) elif hasattr(data, "__array__"): @@ -571,19 +574,7 @@ def __new__( klass = cls._dtype_to_subclass(arr.dtype) arr = klass._ensure_array(arr, arr.dtype, copy=False) - result = klass._simple_new(arr, name, refs=refs) - if dtype is None and is_pandas_object and data_dtype == np.object_: - if result.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Index " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old " - "behavior.", - FutureWarning, - stacklevel=2, - ) - return result # type: ignore[return-value] + return klass._simple_new(arr, name, refs=refs) @classmethod def _ensure_array(cls, data, dtype, copy: bool): @@ -1099,7 +1090,7 @@ def view(self, cls=None): result._id = self._id return result - def astype(self, dtype, copy: bool = True): + def astype(self, dtype: Dtype, copy: bool = True): """ Create an Index with values cast to dtypes. @@ -2634,7 +2625,7 @@ def isna(self) -> npt.NDArray[np.bool_]: ... 
) >>> idx DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.isna() array([False, True, True, True]) """ @@ -2919,14 +2910,12 @@ def __iadd__(self, other): return self + other @final - def __nonzero__(self) -> NoReturn: + def __bool__(self) -> NoReturn: raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) - __bool__ = __nonzero__ - # -------------------------------------------------------------------- # Set Operation Methods @@ -2969,7 +2958,7 @@ def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index return self, other @final - def union(self, other, sort=None): + def union(self, other, sort: bool | None = None): """ Form the union of two Index objects. @@ -3346,7 +3335,7 @@ def _intersection_via_get_indexer( return result @final - def difference(self, other, sort=None): + def difference(self, other, sort: bool | None = None): """ Return a new Index with elements of index not in `other`. @@ -3432,7 +3421,12 @@ def _wrap_difference_result(self, other, result): # We will override for MultiIndex to handle empty results return self._wrap_setop_result(other, result) - def symmetric_difference(self, other, result_name=None, sort=None): + def symmetric_difference( + self, + other, + result_name: abc.Hashable | None = None, + sort: bool | None = None, + ): """ Compute the symmetric difference of two Index objects. @@ -5459,9 +5453,10 @@ def equals(self, other: Any) -> bool: if ( isinstance(self.dtype, StringDtype) - and self.dtype.storage == "pyarrow_numpy" + and self.dtype.na_value is np.nan and other.dtype != self.dtype ): + # TODO(infer_string) can we avoid this special case? # special case for object behavior return other.equals(self.astype(object)) @@ -5791,13 +5786,6 @@ def sort_values( else: return sorted_index - @final - def sort(self, *args, **kwargs): - """ - Use sort_values instead. - """ - raise TypeError("cannot sort an Index object in-place, use sort_values instead") - def shift(self, periods: int = 1, freq=None) -> Self: """ Shift index by desired number of time frequency increments. @@ -6401,7 +6389,7 @@ def _transform_index(self, func, *, level=None) -> Index: items = [func(x) for x in self] return Index(items, name=self.name, tupleize_cols=False) - def isin(self, values, level=None) -> npt.NDArray[np.bool_]: + def isin(self, values, level: str_t | int | None = None) -> npt.NDArray[np.bool_]: """ Return a boolean array where the index values are in `values`. @@ -6699,7 +6687,12 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: else: return slc - def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: + def slice_locs( + self, + start: SliceType = None, + end: SliceType = None, + step: int | None = None, + ) -> tuple[int, int]: """ Compute slice locations for input labels. @@ -6793,7 +6786,9 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: return start_slice, end_slice - def delete(self, loc) -> Self: + def delete( + self, loc: int | np.integer | list[int] | npt.NDArray[np.integer] + ) -> Self: """ Make new Index with passed location(-s) deleted. 
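# Usage-level view of the __bool__ method above (renamed from __nonzero__):
# truth-testing any Index remains ambiguous and raises.
import pandas as pd

try:
    bool(pd.Index([1, 2, 3]))
except ValueError as err:
    print(err)  # The truth value of a Index is ambiguous. Use a.empty, ...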
@@ -7239,7 +7234,9 @@ def _maybe_disable_logical_methods(self, opname: str_t) -> None: raise TypeError(f"cannot perform {opname} with {type(self).__name__}") @Appender(IndexOpsMixin.argmin.__doc__) - def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def argmin( + self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs + ) -> int: nv.validate_argmin(args, kwargs) nv.validate_minmax_axis(axis) @@ -7252,7 +7249,9 @@ def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: return super().argmin(skipna=skipna) @Appender(IndexOpsMixin.argmax.__doc__) - def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def argmax( + self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs + ) -> int: nv.validate_argmax(args, kwargs) nv.validate_minmax_axis(axis) @@ -7263,7 +7262,7 @@ def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: raise ValueError("Encountered all NA values") return super().argmax(skipna=skipna) - def min(self, axis=None, skipna: bool = True, *args, **kwargs): + def min(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs): """ Return the minimum value of the Index. @@ -7326,7 +7325,7 @@ def min(self, axis=None, skipna: bool = True, *args, **kwargs): return nanops.nanmin(self._values, skipna=skipna) - def max(self, axis=None, skipna: bool = True, *args, **kwargs): + def max(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs): """ Return the maximum value of the Index. @@ -7485,9 +7484,12 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: -------- ensure_index """ + from pandas.core.indexes.api import default_index from pandas.core.indexes.multi import MultiIndex - if len(sequences) == 1: + if len(sequences) == 0: + return default_index(0) + elif len(sequences) == 1: if names is not None: names = names[0] return Index(maybe_sequence_to_range(sequences[0]), name=names) @@ -7540,7 +7542,7 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: index_like = list(index_like) if isinstance(index_like, list): - if type(index_like) is not list: # noqa: E721 + if type(index_like) is not list: # must check for exactly list here because of strict type # check in clean_index_list index_like = list(index_like) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 3b04d95cb7cbd..312219eb7b91a 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -377,8 +377,13 @@ def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. 
if is_valid_na_for_dtype(key, self.categories.dtype): return self.hasnans - - return contains(self, key, container=self._engine) + if self.categories._typ == "rangeindex": + container: Index | libindex.IndexEngine | libindex.ExtensionEngine = ( + self.categories + ) + else: + container = self._engine + return contains(self, key, container=container) def reindex( self, target, method=None, level=None, limit: int | None = None, tolerance=None diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7e8d808769bc1..e1120466eaf83 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -523,7 +523,7 @@ def _as_range_index(self) -> RangeIndex: # Convert our i8 representations to RangeIndex # Caller is responsible for checking isinstance(self.freq, Tick) freq = cast(Tick, self.freq) - tick = Timedelta(freq).as_unit("ns")._value + tick = Timedelta(freq).as_unit(self.unit)._value rng = range(self[0]._value, self[-1]._value + tick, tick) return RangeIndex(rng) @@ -536,7 +536,9 @@ def _wrap_range_setop(self, other, res_i8) -> Self: # RangeIndex defaults to step=1, which we don't want. new_freq = self.freq elif isinstance(res_i8, RangeIndex): - new_freq = to_offset(Timedelta(res_i8.step)) + new_freq = to_offset( + Timedelta(res_i8.step, unit=self.unit).as_unit(self.unit) + ) # TODO(GH#41493): we cannot just do # type(self._data)(res_i8.values, dtype=self.dtype, freq=new_freq) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 78f04f57029b1..00a929724ed4c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -147,7 +147,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): One of pandas date offset strings or corresponding objects. The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation. - tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str + tz : zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or str Set the Timezone of the data. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. @@ -242,7 +242,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> idx DatetimeIndex(['2020-01-01 10:00:00+00:00', '2020-02-01 11:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) """ _typ = "datetimeindex" @@ -473,7 +473,8 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: Examples -------- >>> idx = pd.DatetimeIndex( - ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"] + ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"], + ... dtype="M8[ns]", ... ) >>> idx DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], @@ -1069,6 +1070,13 @@ def bdate_range( Returns ------- DatetimeIndex + Fixed frequency DatetimeIndex. + + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + period_range : Return a fixed frequency PeriodIndex. + timedelta_range : Return a fixed frequency TimedeltaIndex. 
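# Quick usage sketch for bdate_range, whose docstring gains the return
# description and See Also entries above; the dates here are arbitrary.
import pandas as pd

pd.bdate_range(start="2024-01-01", periods=4)
# DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04'],
#               dtype='datetime64[ns]', freq='B')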
Notes ----- diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index fc806a3546571..48d5e59250f35 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -7,7 +7,6 @@ from inspect import signature from typing import ( TYPE_CHECKING, - Callable, TypeVar, ) @@ -18,6 +17,8 @@ from pandas.core.indexes.base import Index if TYPE_CHECKING: + from collections.abc import Callable + import numpy as np from pandas._typing import ( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3927619a567bf..58664b07f4a46 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Collection, Generator, Hashable, @@ -12,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, ) @@ -209,8 +209,12 @@ class MultiIndex(Index): level). names : optional sequence of objects Names for each of the index levels. (name is accepted for compat). + dtype : Numpy dtype or pandas type, optional + Data type for the MultiIndex. copy : bool, default False Copy the meta-data. + name : Label + Kept for compatibility with 1-dimensional Index. Should not be used. verify_integrity : bool, default True Check that the levels/codes are consistent and valid. @@ -634,7 +638,6 @@ def from_product( (2, 'purple')], names=['number', 'color']) """ - from pandas.core.reshape.util import cartesian_product if not is_list_like(iterables): raise TypeError("Input must be a list / sequence of iterables.") @@ -771,6 +774,11 @@ def dtypes(self) -> Series: """ Return the dtypes as a Series for the underlying MultiIndex. + See Also + -------- + Index.dtype : Return the dtype object of the underlying data. + Series.dtypes : Return the data type of the underlying Series. + Examples -------- >>> idx = pd.MultiIndex.from_product( @@ -826,6 +834,12 @@ def levels(self) -> FrozenList: it filters out all rows of the level C, MultiIndex.levels will still return A, B, C. + See Also + -------- + MultiIndex.codes : The codes of the levels in the MultiIndex. + MultiIndex.get_level_values : Return vector of label values for requested + level. + Examples -------- >>> index = pd.MultiIndex.from_product( @@ -919,6 +933,19 @@ def set_levels( """ Set new levels on MultiIndex. Defaults to returning new index. + The `set_levels` method provides a flexible way to change the levels of a + `MultiIndex`. This is particularly useful when you need to update the + index structure of your DataFrame without altering the data. The method + returns a new `MultiIndex` unless the operation is performed in-place, + ensuring that the original index remains unchanged unless explicitly + modified. + + The method checks the integrity of the new levels against the existing + codes by default, but this can be disabled if you are confident that + your levels are consistent with the underlying data. This can be useful + when you want to perform optimizations or make specific adjustments to + the index levels that do not strictly adhere to the original structure. + Parameters ---------- levels : sequence or list of sequence @@ -931,6 +958,14 @@ def set_levels( Returns ------- MultiIndex + A new `MultiIndex` with the updated levels. + + See Also + -------- + MultiIndex.set_codes : Set new codes on the existing `MultiIndex`. + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + Index.set_names : Set Index or MultiIndex name. 
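# Hedged sketch of set_levels as described above: a new MultiIndex is
# returned with the level values replaced; the sample data is made up.
import pandas as pd

mi = pd.MultiIndex.from_arrays([[1, 2], ["a", "b"]])
mi.set_levels([["x", "y"], ["p", "q"]]).levels
# FrozenList([['x', 'y'], ['p', 'q']])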
Examples -------- @@ -1016,6 +1051,13 @@ def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. + See Also + -------- + MultiIndex.levels : Get the levels of the MultiIndex. + MultiIndex.codes : Get the codes of the MultiIndex. + MultiIndex.from_arrays : Convert arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]]) @@ -1030,7 +1072,19 @@ def nlevels(self) -> int: @property def levshape(self) -> Shape: """ - A tuple with the length of each level. + A tuple representing the length of each level in the MultiIndex. + + In a `MultiIndex`, each level can contain multiple unique values. The + `levshape` property provides a quick way to assess the size of each + level by returning a tuple where each entry represents the number of + unique values in that specific level. This is particularly useful in + scenarios where you need to understand the structure and distribution + of your index levels, such as when working with multidimensional data. + + See Also + -------- + MultiIndex.shape : Return a tuple of the shape of the MultiIndex. + MultiIndex.levels : Returns the levels of the MultiIndex. Examples -------- @@ -1134,6 +1188,12 @@ def set_codes( new index (of same type and class...etc) or None The same type as the caller or None if ``inplace=True``. + See Also + -------- + MultiIndex.set_levels : Set new levels on MultiIndex. + MultiIndex.codes : Get the codes of the levels in the MultiIndex. + MultiIndex.levels : Get the levels of the MultiIndex. + Examples -------- >>> idx = pd.MultiIndex.from_tuples( @@ -1255,20 +1315,37 @@ def copy( # type: ignore[override] name=None, ) -> Self: """ - Make a copy of this object. + Make a copy of this object. Names, dtype, levels and codes can be passed and \ + will be set on new copy. - Names, dtype, levels and codes can be passed and will be set on new copy. + The `copy` method provides a mechanism to create a duplicate of an + existing MultiIndex object. This is particularly useful in scenarios where + modifications are required on an index, but the original MultiIndex should + remain unchanged. By specifying the `deep` parameter, users can control + whether the copy should be a deep or shallow copy, providing flexibility + depending on the size and complexity of the MultiIndex. Parameters ---------- names : sequence, optional + Names to set on the new MultiIndex object. deep : bool, default False + If False, the new object will be a shallow copy. If True, a deep copy + will be attempted. Deep copying can be potentially expensive for large + MultiIndex objects. name : Label Kept for compatibility with 1-dimensional Index. Should not be used. Returns ------- MultiIndex + A new MultiIndex object with the specified modifications. + + See Also + -------- + MultiIndex.from_arrays : Convert arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Convert DataFrame to MultiIndex. Notes ----- @@ -1656,7 +1733,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: # (previously declared in base class "IndexOpsMixin") _duplicated = duplicated # type: ignore[misc] - def fillna(self, value, downcast=None): + def fillna(self, value): """ fillna is not implemented for MultiIndex """ @@ -1706,6 +1783,16 @@ def get_level_values(self, level) -> Index: # type: ignore[override] Return vector of label values for requested level. 
Length of returned vector is equal to the length of the index. + The `get_level_values` method is a crucial utility for extracting + specific level values from a `MultiIndex`. This function is particularly + useful when working with multi-level data, allowing you to isolate + and manipulate individual levels without having to deal with the + complexity of the entire `MultiIndex` structure. It seamlessly handles + both integer and string-based level access, providing flexibility in + how you can interact with the data. Additionally, this method ensures + that the returned `Index` maintains the integrity of the original data, + even when missing values are present, by appropriately casting the + result to a suitable data type. Parameters ---------- @@ -1719,6 +1806,13 @@ def get_level_values(self, level) -> Index: # type: ignore[override] Values is a level of this MultiIndex converted to a single :class:`Index` (or subclass thereof). + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Index : Immutable sequence used for indexing and alignment. + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + Notes ----- If the level contains missing values, the result may be casted to @@ -2014,9 +2108,22 @@ def remove_unused_levels(self) -> MultiIndex: appearance, meaning the same .values and ordering. It will also be .equals() to the original. + The `remove_unused_levels` method is useful in cases where you have a + MultiIndex with hierarchical levels, but some of these levels are no + longer needed due to filtering or subsetting operations. By removing + the unused levels, the resulting MultiIndex becomes more compact and + efficient, which can improve performance in subsequent operations. + Returns ------- MultiIndex + A new MultiIndex with unused levels removed. + + See Also + -------- + MultiIndex.droplevel : Remove specified levels from a MultiIndex. + MultiIndex.reorder_levels : Rearrange levels of a MultiIndex. + MultiIndex.set_levels : Set new levels on a MultiIndex. Examples -------- @@ -2196,15 +2303,28 @@ def append(self, other): """ Append a collection of Index options together. + The `append` method is used to combine multiple `Index` objects into a single + `Index`. This is particularly useful when dealing with multi-level indexing + (MultiIndex) where you might need to concatenate different levels of indices. + The method handles the alignment of the levels and codes of the indices being + appended to ensure consistency in the resulting `MultiIndex`. + Parameters ---------- other : Index or list/tuple of indices + Index or list/tuple of Index objects to be appended. Returns ------- Index The combined index. + See Also + -------- + MultiIndex: A multi-level, or hierarchical, index object for pandas objects. + Index.append : Append a collection of Index options together. + concat : Concatenate pandas objects along a particular axis. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"]]) @@ -2289,16 +2409,32 @@ def drop( # type: ignore[override] """ Make a new :class:`pandas.MultiIndex` with the passed list of codes deleted. + This method allows for the removal of specified labels from a MultiIndex. + The labels to be removed can be provided as a list of tuples if no level + is specified, or as a list of labels from a specific level if the level + parameter is provided. This can be useful for refining the structure of a + MultiIndex to fit specific requirements. 
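# Small usage sketch matching the drop() description above: labels can be
# dropped as full tuples, or per level; the tuples here are arbitrary.
import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
mi.drop([("a", 1)])        # drop one full tuple
mi.drop(["a"], level=0)    # drop every 'a' entry from level 0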
+ Parameters ---------- codes : array-like Must be a list of tuples when ``level`` is not specified. level : int or level name, default None + Level from which the labels will be dropped. errors : str, default 'raise' + If 'ignore', suppress error and existing labels are dropped. Returns ------- MultiIndex + A new MultiIndex with the specified labels removed. + + See Also + -------- + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + MultiIndex.reorder_levels : Rearrange levels using input order. + MultiIndex.rename : Rename levels in a MultiIndex. Examples -------- @@ -2912,14 +3048,19 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: def get_loc(self, key): """ - Get location for a label or a tuple of labels. + Get location for a label or a tuple of labels. The location is returned \ + as an integer/slice or boolean mask. - The location is returned as an integer/slice or boolean - mask. + This method returns the integer location, slice object, or boolean mask + corresponding to the specified key, which can be a single label or a tuple + of labels. The key represents a position in the MultiIndex, and the location + indicates where the key is found within the index. Parameters ---------- key : label or tuple of labels (one for each level) + A label or tuple of labels that correspond to the levels of the MultiIndex. + The key must match the structure of the MultiIndex. Returns ------- @@ -3590,6 +3731,11 @@ def truncate(self, before=None, after=None) -> MultiIndex: MultiIndex The truncated MultiIndex. + See Also + -------- + DataFrame.truncate : Truncate a DataFrame before and after some index values. + Series.truncate : Truncate a Series before and after some index values. + Examples -------- >>> mi = pd.MultiIndex.from_arrays([["a", "b", "c"], ["x", "y", "z"]]) @@ -3873,8 +4019,11 @@ def insert(self, loc: int, item) -> MultiIndex: # have to insert into level # must insert at end otherwise you have to recompute all the # other codes - lev_loc = len(level) - level = level.insert(lev_loc, k) + if isna(k): # GH 59003 + lev_loc = -1 + else: + lev_loc = len(level) + level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) @@ -4069,3 +4218,60 @@ def _require_listlike(level, arr, arrname: str): if not is_list_like(arr) or not is_list_like(arr[0]): raise TypeError(f"{arrname} must be list of lists-like") return level, arr + + +def cartesian_product(X: list[np.ndarray]) -> list[np.ndarray]: + """ + Numpy version of itertools.product. + Sometimes faster (for large inputs)... 
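# For orientation: the relocated cartesian_product helper mirrors
# itertools.product, but returns one repeated/tiled ndarray per input
# dimension rather than tuples.
import itertools

list(itertools.product("ABC", [1, 2]))
# [('A', 1), ('A', 2), ('B', 1), ('B', 2), ('C', 1), ('C', 2)]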
+ + Parameters + ---------- + X : list-like of list-likes + + Returns + ------- + product : list of ndarrays + + Examples + -------- + >>> cartesian_product([list("ABC"), [1, 2]]) + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype=' None: # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value - if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1: + if not take_split_path and len(self.obj._mgr.blocks) and self.ndim > 1: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value - arr = self.obj._mgr.arrays[0] + arr = self.obj._mgr.blocks[0].values take_split_path = not can_hold_element( arr, extract_array(val, extract_numpy=True) ) @@ -2124,14 +2129,14 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: self.obj._mgr.column_setitem( loc, plane_indexer, value, inplace_only=True ) - except (ValueError, TypeError, LossySetitemError): + except (ValueError, TypeError, LossySetitemError) as exc: # If we're setting an entire column and we can't do it inplace, # then we can use value's dtype (or inferred dtype) # instead of object dtype = self.obj.dtypes.iloc[loc] if dtype not in (np.void, object) and not self.obj.empty: # - Exclude np.void, as that is a special case for expansion. - # We want to warn for + # We want to raise for # df = pd.DataFrame({'a': [1, 2]}) # df.loc[:, 'a'] = .3 # but not for @@ -2140,14 +2145,9 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: # - Exclude `object`, as then no upcasting happens. # - Exclude empty initial object with enlargement, # as then there's nothing to be inconsistent with. - warnings.warn( - f"Setting an item of incompatible dtype is deprecated " - "and will raise in a future error of pandas. " - f"Value '{value}' has dtype incompatible with {dtype}, " - "please explicitly cast to a compatible dtype first.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise TypeError( + f"Invalid value '{value}' for dtype '{dtype}'" + ) from exc self.obj.isetitem(loc, value) else: # set value into the column (first attempting to operate inplace, then @@ -2395,7 +2395,7 @@ def ravel(i): new_ix = Index([new_ix]) else: new_ix = Index(new_ix) - if ser.index.equals(new_ix): + if not len(new_ix) or ser.index.equals(new_ix): if using_cow: return ser return ser._values.copy() @@ -2440,7 +2440,7 @@ def _align_frame(self, indexer, df: DataFrame) -> DataFrame: ax = self.obj.axes[i] if is_sequence(ix) or isinstance(ix, slice): if isinstance(ix, np.ndarray): - ix = ix.ravel() + ix = ix.reshape(-1) if idx is None: idx = ax[ix] elif cols is None: diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 4575837fb12fc..7f2647d64b190 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -36,6 +36,19 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: """ Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + .. note:: + + For new development, we highly recommend using the Arrow C Data Interface + alongside the Arrow PyCapsule Interface instead of the interchange protocol + + .. 
warning:: + + Due to severe implementation issues, we recommend only considering using the + interchange protocol in the following cases: + + - converting to pandas: for pandas >= 2.0.3 + - converting from pandas: for pandas >= 3.0.0 + Parameters ---------- df : DataFrameXchg diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 24bfad4791b29..04944db2ebd9c 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -10,6 +10,7 @@ from __future__ import annotations from typing import TYPE_CHECKING +import warnings import numpy as np @@ -87,6 +88,15 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ + warnings.warn( + # GH#56815 + "make_block is deprecated and will be removed in a future version. " + "Use pd.api.internals.create_dataframe_from_blocks or " + "(recommended) higher-level public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + if dtype is not None: dtype = pandas_dtype(dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cffb1f658a640..149bef6258bfa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, final, @@ -101,7 +100,6 @@ ) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, IntervalArray, @@ -121,6 +119,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterable, Sequence, @@ -429,7 +428,7 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: # Up/Down-casting @final - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + def coerce_to_target_dtype(self, other, raise_on_upcast: bool) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -456,7 +455,7 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) ) ): - warn_on_upcast = False + raise_on_upcast = False elif ( isinstance(other, np.ndarray) and other.ndim == 1 @@ -464,17 +463,10 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and is_float_dtype(other.dtype) and lib.has_only_ints_or_nan(other) ): - warn_on_upcast = False - - if warn_on_upcast: - warnings.warn( - f"Setting an item of incompatible dtype is deprecated " - "and will raise an error in a future version of pandas. " - f"Value '{other}' has dtype incompatible with {self.values.dtype}, " - "please explicitly cast to a compatible dtype first.", - FutureWarning, - stacklevel=find_stack_level(), - ) + raise_on_upcast = False + + if raise_on_upcast: + raise TypeError(f"Invalid value '{other}' for dtype '{self.values.dtype}'") if self.values.dtype == new_dtype: raise AssertionError( f"Did not expect new dtype {new_dtype} to equal self.dtype " @@ -696,14 +688,6 @@ def replace( # go through replace_list values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=to_replace, value=value, inplace=True) - return [blk] - if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. 
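# Behavior sketch for the warn->raise change above: with this patch, setting
# an incompatible value into an int64 column raises instead of warning
# (this mirrors the df.loc example quoted in the comments above).
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
try:
    df.loc[:, "a"] = 0.3
except TypeError as err:
    print(err)  # e.g. "Invalid value '0.3' for dtype 'int64'"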
@@ -729,7 +713,7 @@ def replace( if value is None or value is NA: blk = self.astype(np.dtype(object)) else: - blk = self.coerce_to_target_dtype(value) + blk = self.coerce_to_target_dtype(value, raise_on_upcast=False) return blk.replace( to_replace=to_replace, value=value, @@ -803,14 +787,6 @@ def replace_list( """ values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=src_list, value=dest_list, inplace=True) - return [blk] - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -1122,7 +1098,7 @@ def setitem(self, indexer, value) -> Block: casted = np_can_hold_element(values.dtype, value) except LossySetitemError: # current dtype cannot store value, coerce to common dtype - nb = self.coerce_to_target_dtype(value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(value, raise_on_upcast=True) return nb.setitem(indexer, value) else: if self.dtype == _dtype_obj: @@ -1193,7 +1169,7 @@ def putmask(self, mask, new) -> list[Block]: if not is_list_like(new): # using just new[indexer] can't save us the need to cast return self.coerce_to_target_dtype( - new, warn_on_upcast=True + new, raise_on_upcast=True ).putmask(mask, new) else: indexer = mask.nonzero()[0] @@ -1261,7 +1237,7 @@ def where(self, other, cond) -> list[Block]: if self.ndim == 1 or self.shape[0] == 1: # no need to split columns - block = self.coerce_to_target_dtype(other) + block = self.coerce_to_target_dtype(other, raise_on_upcast=False) return block.where(orig_other, cond) else: @@ -1455,7 +1431,7 @@ def shift(self, periods: int, fill_value: Any = None) -> list[Block]: fill_value, ) except LossySetitemError: - nb = self.coerce_to_target_dtype(fill_value) + nb = self.coerce_to_target_dtype(fill_value, raise_on_upcast=False) return nb.shift(periods, fill_value=fill_value) else: @@ -1654,11 +1630,11 @@ def setitem(self, indexer, value): except (ValueError, TypeError): if isinstance(self.dtype, IntervalDtype): # see TestSetitemFloatIntervalWithIntIntervalValues - nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(orig_value, raise_on_upcast=True) return nb.setitem(orig_indexer, orig_value) elif isinstance(self, NDArrayBackedExtensionBlock): - nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + nb = self.coerce_to_target_dtype(orig_value, raise_on_upcast=True) return nb.setitem(orig_indexer, orig_value) else: @@ -1693,13 +1669,13 @@ def where(self, other, cond) -> list[Block]: if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): # TestSetitemFloatIntervalWithIntIntervalValues - blk = self.coerce_to_target_dtype(orig_other) + blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) return blk.where(orig_other, orig_cond) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_other) + blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) return blk.where(orig_other, orig_cond) else: @@ -1754,13 +1730,13 @@ def putmask(self, mask, new) -> list[Block]: if isinstance(self.dtype, IntervalDtype): # Discussion about what we want to support in the general # case GH#39584 - blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + blk = 
self.coerce_to_target_dtype(orig_new, raise_on_upcast=True) return blk.putmask(orig_mask, orig_new) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + blk = self.coerce_to_target_dtype(orig_new, raise_on_upcast=True) return blk.putmask(orig_mask, orig_new) else: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index cea52bf8c91b2..08e1650a5de12 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -14,7 +14,7 @@ import numpy as np from numpy import ma -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib @@ -192,6 +192,7 @@ def ndarray_to_mgr( ) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray + infer_object = not isinstance(values, (ABCSeries, Index, ExtensionArray)) if isinstance(values, ABCSeries): if columns is None: @@ -287,22 +288,21 @@ def ndarray_to_mgr( # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(values.dtype): + if dtype is None and infer_object and is_object_dtype(values.dtype): obj_columns = list(values) maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] # don't convert (and copy) the objects if no type inference occurs if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): - dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] block_values = [ - new_block_2d(dvals_list[n], placement=BlockPlacement(n)) - for n in range(len(dvals_list)) + new_block_2d(ensure_block_shape(dval, 2), placement=BlockPlacement(n)) + for n, dval in enumerate(maybe_datetime) ] else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] - elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): - dtype = StringDtype(storage="pyarrow_numpy") + elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): + dtype = StringDtype(storage="pyarrow", na_value=np.nan) obj_columns = list(values) block_values = [ @@ -842,7 +842,7 @@ def _list_of_dict_to_arrays( # assure that they are of the base dict class and not of derived # classes - data = [d if type(d) is dict else dict(d) for d in data] # noqa: E721 + data = [d if type(d) is dict else dict(d) for d in data] content = lib.dicts_to_array(data, list(columns)) return content, columns diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7c1bcbec1d3f2..c42ea44b2fc89 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -8,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, NoReturn, cast, @@ -249,7 +249,7 @@ def blklocs(self) -> npt.NDArray[np.intp]: def make_empty(self, axes=None) -> Self: """return an empty BlockManager with the items axis of len 0""" if axes is None: - axes = [Index([])] + self.axes[1:] + axes = [default_index(0)] + self.axes[1:] # preserve dtype if possible if self.ndim == 1: @@ -263,12 +263,9 @@ def make_empty(self, axes=None) -> Self: blocks = [] return type(self).from_blocks(blocks, 
axes) - def __nonzero__(self) -> bool: + def __bool__(self) -> bool: return True - # Python3 compat - __bool__ = __nonzero__ - def set_axis(self, axis: AxisInt, new_labels: Index) -> None: # Caller is responsible for ensuring we have an Index object. self._validate_set_axis(axis, new_labels) @@ -353,6 +350,8 @@ def arrays(self) -> list[ArrayLike]: Warning! The returned arrays don't handle Copy-on-Write, so this should be used with caution (only in read-mode). """ + # TODO: Deprecate, usage in Dask + # https://github.com/dask/dask/blob/484fc3f1136827308db133cd256ba74df7a38d8c/dask/base.py#L1312 return [blk.values for blk in self.blocks] def __repr__(self) -> str: @@ -819,11 +818,13 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0( - indexer, - fill_value=fill_value, - only_slice=only_slice, - use_na_proxy=use_na_proxy, + new_blocks = list( + self._slice_take_blocks_ax0( + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, + ) ) else: new_blocks = [ @@ -855,7 +856,7 @@ def _slice_take_blocks_ax0( *, use_na_proxy: bool = False, ref_inplace_op: bool = False, - ) -> list[Block]: + ) -> Generator[Block, None, None]: """ Slice/take blocks along axis=0. @@ -873,9 +874,9 @@ def _slice_take_blocks_ax0( ref_inplace_op: bool, default False Don't track refs if True because we operate inplace - Returns - ------- - new_blocks : list of Block + Yields + ------ + Block : New Block """ allow_fill = fill_value is not lib.no_default @@ -890,9 +891,10 @@ def _slice_take_blocks_ax0( # GH#32959 EABlock would fail since we can't make 0-width # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: - return [] + return bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + yield blk.getitem_block_columns(slobj, new_mgr_locs=bp) + return elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -900,25 +902,21 @@ def _slice_take_blocks_ax0( if not allow_fill and only_slice: # GH#33597 slice instead of take, so we get # views instead of copies - blocks = [ - blk.getitem_block_columns( + for i, ml in enumerate(slobj): + yield blk.getitem_block_columns( slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i), ref_inplace_op=ref_inplace_op, ) - for i, ml in enumerate(slobj) - ] - return blocks else: bp = BlockPlacement(slice(0, sllen)) - return [ - blk.take_nd( - slobj, - axis=0, - new_mgr_locs=bp, - fill_value=fill_value, - ) - ] + yield blk.take_nd( + slobj, + axis=0, + new_mgr_locs=bp, + fill_value=fill_value, + ) + return if sl_type == "slice": blknos = self.blknos[slobj] @@ -933,18 +931,15 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). 
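# Generic sketch (names are made up) of the refactor pattern applied to
# _slice_take_blocks_ax0 above: append-and-return becomes a generator, and
# the one caller that needs a list simply wraps the call.
from collections.abc import Generator

def take_blocks() -> Generator[int, None, None]:
    for blk in (1, 2, 3):
        yield blk          # previously: blocks.append(blk) ... return blocks

new_blocks = list(take_blocks())   # as done in reindex_indexer above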
- blocks = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: # If we've got here, fill_value was not lib.no_default - blocks.append( - self._make_na_block( - placement=mgr_locs, - fill_value=fill_value, - use_na_proxy=use_na_proxy, - ) + yield self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, ) else: blk = self.blocks[blkno] @@ -959,7 +954,7 @@ def _slice_take_blocks_ax0( for mgr_loc in mgr_locs: newblk = blk.copy(deep=deep) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) - blocks.append(newblk) + yield newblk else: # GH#32779 to avoid the performance penalty of copying, @@ -970,7 +965,7 @@ def _slice_take_blocks_ax0( if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) - blocks.append(nb) + yield nb elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies @@ -979,12 +974,10 @@ def _slice_take_blocks_ax0( bp = BlockPlacement(ml) nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) - blocks.append(nb) + yield nb else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) - - return blocks + yield nb def _make_na_block( self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False @@ -2068,7 +2061,7 @@ def array(self) -> ArrayLike: """ Quick access to the backing array of the Block. """ - return self.arrays[0] + return self.blocks[0].values # error: Cannot override writeable attribute with read-only property @property diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index ef20d4c509732..17d4d38c97f33 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -12,7 +12,6 @@ ) from typing import ( TYPE_CHECKING, - Callable, cast, ) @@ -42,6 +41,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 283acaca2c117..02e7445f1d275 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -29,6 +29,8 @@ ) from pandas.core.dtypes.dtypes import BaseMaskedDtype +from pandas.core.indexes.api import default_index + if TYPE_CHECKING: from pandas._typing import ( DtypeObj, @@ -38,6 +40,7 @@ from pandas import ( DataFrame, + Index, Series, ) else: @@ -199,8 +202,6 @@ def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> No self.columns = columns def compute(self, method: str) -> DataFrame: - from pandas.core.api import Index - n = self.n frame = self.obj columns = self.columns @@ -227,7 +228,7 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: original_index = frame.index cur_frame = frame = frame.reset_index(drop=True) cur_n = n - indexer = Index([], dtype=np.int64) + indexer: Index = default_index(0) for i, column in enumerate(columns): # For each column we apply method to cur_frame[column]. 
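# The SelectNFrame path touched above backs DataFrame.nlargest/nsmallest;
# a quick usage reminder with made-up data.
import pandas as pd

df = pd.DataFrame({"x": [3, 1, 2], "y": [9, 8, 7]})
df.nlargest(2, "x")
#    x  y
# 0  3  9
# 2  2  7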
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 22092551ec882..e775156a6ae2f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -3,8 +3,8 @@ import functools import itertools from typing import ( + TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -48,6 +48,9 @@ notna, ) +if TYPE_CHECKING: + from collections.abc import Callable + bn = import_optional_dependency("bottleneck", errors="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py index d19ac6246e1cd..5cbe1c421e05a 100644 --- a/pandas/core/ops/common.py +++ b/pandas/core/ops/common.py @@ -5,10 +5,7 @@ from __future__ import annotations from functools import wraps -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING from pandas._libs.lib import item_from_zerodim from pandas._libs.missing import is_matching_na @@ -19,6 +16,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import F diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py index c300db8c114c1..395db1617cb63 100644 --- a/pandas/core/ops/invalid.py +++ b/pandas/core/ops/invalid.py @@ -8,13 +8,14 @@ from typing import ( TYPE_CHECKING, Any, - Callable, NoReturn, ) import numpy as np if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( ArrayLike, Scalar, diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ccbe25fdae841..8ee71ea2293e6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -4,7 +4,6 @@ from textwrap import dedent from typing import ( TYPE_CHECKING, - Callable, Literal, cast, final, @@ -92,7 +91,10 @@ ) if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._typing import ( Any, diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 7055201b5a1ee..6836ba3f65691 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -5,9 +5,9 @@ from __future__ import annotations from collections import abc +import types from typing import ( TYPE_CHECKING, - Callable, Literal, cast, overload, @@ -17,10 +17,12 @@ import numpy as np from pandas._libs import lib -from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.common import is_bool +from pandas.core.dtypes.common import ( + is_bool, + is_scalar, +) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -46,6 +48,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -385,291 +388,330 @@ def concat( DeprecationWarning, stacklevel=find_stack_level(), ) + if join == "outer": + intersect = False + elif join == "inner": + intersect = True + else: # pragma: no cover + raise ValueError( + "Only can inner (intersect) or outer (union) join the other axis" + ) - op = _Concatenator( - objs, - axis=axis, - ignore_index=ignore_index, - join=join, - keys=keys, - levels=levels, - names=names, - verify_integrity=verify_integrity, - sort=sort, - ) - - return op.get_result() + if not is_bool(sort): + raise ValueError( + f"The 'sort' keyword only accepts boolean values; {sort} was passed." 
+ ) + sort = bool(sort) + objs, keys, ndims = _clean_keys_and_objs(objs, keys) -class _Concatenator: - """ - Orchestrates a concatenation operation for BlockManagers - """ + # select an object to be our result reference + sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect) - sort: bool - - def __init__( - self, - objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], - axis: Axis = 0, - join: str = "outer", - keys: Iterable[Hashable] | None = None, - levels=None, - names: list[HashableT] | None = None, - ignore_index: bool = False, - verify_integrity: bool = False, - sort: bool = False, - ) -> None: - if isinstance(objs, (ABCSeries, ABCDataFrame, str)): - raise TypeError( - "first argument must be an iterable of pandas " - f'objects, you passed an object of type "{type(objs).__name__}"' - ) + # Standardize axis parameter to int + if sample.ndim == 1: + from pandas import DataFrame - if join == "outer": - self.intersect = False - elif join == "inner": - self.intersect = True - else: # pragma: no cover - raise ValueError( - "Only can inner (intersect) or outer (union) join the other axis" - ) + bm_axis = DataFrame._get_axis_number(axis) + is_frame = False + is_series = True + else: + bm_axis = sample._get_axis_number(axis) + is_frame = True + is_series = False - if not is_bool(sort): - raise ValueError( - f"The 'sort' keyword only accepts boolean values; {sort} was passed." - ) - # Incompatible types in assignment (expression has type "Union[bool, bool_]", - # variable has type "bool") - self.sort = sort # type: ignore[assignment] + # Need to flip BlockManager axis in the DataFrame special case + bm_axis = sample._get_block_manager_axis(bm_axis) - self.ignore_index = ignore_index - self.verify_integrity = verify_integrity + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis) - objs, keys, ndims = _clean_keys_and_objs(objs, keys) + axis = 1 - bm_axis if is_frame else 0 + names = names or getattr(keys, "names", None) + return _get_result( + objs, + is_series, + bm_axis, + ignore_index, + intersect, + sort, + keys, + levels, + verify_integrity, + names, + axis, + ) - # select an object to be our result reference - sample, objs = _get_sample_object( - objs, ndims, keys, names, levels, self.intersect - ) - # Standardize axis parameter to int - if sample.ndim == 1: - from pandas import DataFrame +def _sanitize_mixed_ndim( + objs: list[Series | DataFrame], + sample: Series | DataFrame, + ignore_index: bool, + axis: AxisInt, +) -> list[Series | DataFrame]: + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + + new_objs = [] + + current_column = 0 + max_ndim = sample.ndim + for obj in objs: + ndim = obj.ndim + if ndim == max_ndim: + pass + + elif ndim != max_ndim - 1: + raise ValueError( + "cannot concatenate unaligned mixed dimensional NDFrame objects" + ) - axis = DataFrame._get_axis_number(axis) - self._is_frame = False - self._is_series = True else: - axis = sample._get_axis_number(axis) - self._is_frame = True - self._is_series = False - - # Need to flip BlockManager axis in the DataFrame special case - axis = sample._get_block_manager_axis(axis) - - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - if len(ndims) > 1: - objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) - - self.objs = objs - - # note: this is the 
BlockManager axis (since DataFrame is transposed) - self.bm_axis = axis - self.axis = 1 - self.bm_axis if self._is_frame else 0 - self.keys = keys - self.names = names or getattr(keys, "names", None) - self.levels = levels - - def _sanitize_mixed_ndim( - self, - objs: list[Series | DataFrame], - sample: Series | DataFrame, - ignore_index: bool, - axis: AxisInt, - ) -> list[Series | DataFrame]: - # if we have mixed ndims, then convert to highest ndim - # creating column numbers as needed - - new_objs = [] - - current_column = 0 - max_ndim = sample.ndim - for obj in objs: - ndim = obj.ndim - if ndim == max_ndim: - pass - - elif ndim != max_ndim - 1: - raise ValueError( - "cannot concatenate unaligned mixed dimensional NDFrame objects" - ) - - else: - name = getattr(obj, "name", None) - if ignore_index or name is None: - if axis == 1: - # doing a row-wise concatenation so need everything - # to line up - name = 0 - else: - # doing a column-wise concatenation so need series - # to have unique names - name = current_column - current_column += 1 - obj = sample._constructor(obj, copy=False) - if isinstance(obj, ABCDataFrame): - obj.columns = range(name, name + 1, 1) + name = getattr(obj, "name", None) + if ignore_index or name is None: + if axis == 1: + # doing a row-wise concatenation so need everything + # to line up + name = 0 else: - obj = sample._constructor({name: obj}, copy=False) - - new_objs.append(obj) - - return new_objs + # doing a column-wise concatenation so need series + # to have unique names + name = current_column + current_column += 1 + obj = sample._constructor(obj, copy=False) + if isinstance(obj, ABCDataFrame): + obj.columns = range(name, name + 1, 1) + else: + obj = sample._constructor({name: obj}, copy=False) - def get_result(self): - cons: Callable[..., DataFrame | Series] - sample: DataFrame | Series + new_objs.append(obj) - # series only - if self._is_series: - sample = cast("Series", self.objs[0]) + return new_objs - # stack blocks - if self.bm_axis == 0: - name = com.consensus_name_attr(self.objs) - cons = sample._constructor - arrs = [ser._values for ser in self.objs] +def _get_result( + objs: list[Series | DataFrame], + is_series: bool, + bm_axis: AxisInt, + ignore_index: bool, + intersect: bool, + sort: bool, + keys: Iterable[Hashable] | None, + levels, + verify_integrity: bool, + names: list[HashableT] | None, + axis: AxisInt, +): + cons: Callable[..., DataFrame | Series] + sample: DataFrame | Series - res = concat_compat(arrs, axis=0) + # series only + if is_series: + sample = cast("Series", objs[0]) - new_index: Index - if self.ignore_index: - # We can avoid surprisingly-expensive _get_concat_axis - new_index = default_index(len(res)) - else: - new_index = self.new_axes[0] + # stack blocks + if bm_axis == 0: + name = com.consensus_name_attr(objs) + cons = sample._constructor - mgr = type(sample._mgr).from_array(res, index=new_index) + arrs = [ser._values for ser in objs] - result = sample._constructor_from_mgr(mgr, axes=mgr.axes) - result._name = name - return result.__finalize__(self, method="concat") + res = concat_compat(arrs, axis=0) - # combine as columns in a frame + if ignore_index: + new_index: Index = default_index(len(res)) else: - data = dict(enumerate(self.objs)) + new_index = _get_concat_axis_series( + objs, + ignore_index, + bm_axis, + keys, + levels, + verify_integrity, + names, + ) - # GH28330 Preserves subclassed objects through concat - cons = sample._constructor_expanddim + mgr = type(sample._mgr).from_array(res, index=new_index) - index, 
columns = self.new_axes - df = cons(data, index=index, copy=False) - df.columns = columns - return df.__finalize__(self, method="concat") + result = sample._constructor_from_mgr(mgr, axes=mgr.axes) + result._name = name + return result.__finalize__( + types.SimpleNamespace(objs=objs), method="concat" + ) - # combine block managers + # combine as columns in a frame else: - sample = cast("DataFrame", self.objs[0]) - - mgrs_indexers = [] - for obj in self.objs: - indexers = {} - for ax, new_labels in enumerate(self.new_axes): - # ::-1 to convert BlockManager ax to DataFrame ax - if ax == self.bm_axis: - # Suppress reindexing on concat axis - continue - - # 1-ax to convert BlockManager axis to DataFrame axis - obj_labels = obj.axes[1 - ax] - if not new_labels.equals(obj_labels): - indexers[ax] = obj_labels.get_indexer(new_labels) - - mgrs_indexers.append((obj._mgr, indexers)) - - new_data = concatenate_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=False - ) + data = dict(enumerate(objs)) - out = sample._constructor_from_mgr(new_data, axes=new_data.axes) - return out.__finalize__(self, method="concat") + # GH28330 Preserves subclassed objects through concat + cons = sample._constructor_expanddim - @cache_readonly - def new_axes(self) -> list[Index]: - if self._is_series and self.bm_axis == 1: - ndim = 2 - else: - ndim = self.objs[0].ndim - return [ - self._get_concat_axis - if i == self.bm_axis - else get_objs_combined_axis( - self.objs, - axis=self.objs[0]._get_block_manager_axis(i), - intersect=self.intersect, - sort=self.sort, + index = get_objs_combined_axis( + objs, + axis=objs[0]._get_block_manager_axis(0), + intersect=intersect, + sort=sort, ) - for i in range(ndim) - ] - - @cache_readonly - def _get_concat_axis(self) -> Index: - """ - Return index to be used along concatenation axis. 
- """ - if self._is_series: - if self.bm_axis == 0: - indexes = [x.index for x in self.objs] - elif self.ignore_index: - idx = default_index(len(self.objs)) - return idx - elif self.keys is None: - names: list[Hashable] = [None] * len(self.objs) - num = 0 - has_names = False - for i, x in enumerate(self.objs): - if x.ndim != 1: - raise TypeError( - f"Cannot concatenate type 'Series' with " - f"object of type '{type(x).__name__}'" - ) - if x.name is not None: - names[i] = x.name - has_names = True - else: - names[i] = num - num += 1 - if has_names: - return Index(names) - else: - return default_index(len(self.objs)) - else: - return ensure_index(self.keys).set_names(self.names) - else: - indexes = [x.axes[self.axis] for x in self.objs] + columns = _get_concat_axis_series( + objs, ignore_index, bm_axis, keys, levels, verify_integrity, names + ) + df = cons(data, index=index, copy=False) + df.columns = columns + return df.__finalize__(types.SimpleNamespace(objs=objs), method="concat") + + # combine block managers + else: + sample = cast("DataFrame", objs[0]) + + mgrs_indexers = [] + result_axes = new_axes( + objs, + bm_axis, + intersect, + sort, + keys, + names, + axis, + levels, + verify_integrity, + ignore_index, + ) + for obj in objs: + indexers = {} + for ax, new_labels in enumerate(result_axes): + # ::-1 to convert BlockManager ax to DataFrame ax + if ax == bm_axis: + # Suppress reindexing on concat axis + continue + + # 1-ax to convert BlockManager axis to DataFrame axis + obj_labels = obj.axes[1 - ax] + if not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.get_indexer(new_labels) + + mgrs_indexers.append((obj._mgr, indexers)) + + new_data = concatenate_managers( + mgrs_indexers, result_axes, concat_axis=bm_axis, copy=False + ) + + out = sample._constructor_from_mgr(new_data, axes=new_data.axes) + return out.__finalize__(types.SimpleNamespace(objs=objs), method="concat") - if self.ignore_index: - idx = default_index(sum(len(i) for i in indexes)) - return idx - if self.keys is None: - if self.levels is not None: +def new_axes( + objs: list[Series | DataFrame], + bm_axis: AxisInt, + intersect: bool, + sort: bool, + keys: Iterable[Hashable] | None, + names: list[HashableT] | None, + axis: AxisInt, + levels, + verify_integrity: bool, + ignore_index: bool, +) -> list[Index]: + """Return the new [index, column] result for concat.""" + return [ + _get_concat_axis_dataframe( + objs, + axis, + ignore_index, + keys, + names, + levels, + verify_integrity, + ) + if i == bm_axis + else get_objs_combined_axis( + objs, + axis=objs[0]._get_block_manager_axis(i), + intersect=intersect, + sort=sort, + ) + for i in range(2) + ] + + +def _get_concat_axis_series( + objs: list[Series | DataFrame], + ignore_index: bool, + bm_axis: AxisInt, + keys: Iterable[Hashable] | None, + levels, + verify_integrity: bool, + names: list[HashableT] | None, +) -> Index: + """Return result concat axis when concatenating Series objects.""" + if ignore_index: + return default_index(len(objs)) + elif bm_axis == 0: + indexes = [x.index for x in objs] + if keys is None: + if levels is not None: raise ValueError("levels supported only when keys is not None") concat_axis = _concat_indexes(indexes) else: - concat_axis = _make_concat_multiindex( - indexes, self.keys, self.levels, self.names - ) + concat_axis = _make_concat_multiindex(indexes, keys, levels, names) + if verify_integrity and not concat_axis.is_unique: + overlap = concat_axis[concat_axis.duplicated()].unique() + raise ValueError(f"Indexes have overlapping 
values: {overlap}") + return concat_axis + elif keys is None: + result_names: list[Hashable] = [None] * len(objs) + num = 0 + has_names = False + for i, x in enumerate(objs): + if x.ndim != 1: + raise TypeError( + f"Cannot concatenate type 'Series' with " + f"object of type '{type(x).__name__}'" + ) + if x.name is not None: + result_names[i] = x.name + has_names = True + else: + result_names[i] = num + num += 1 + if has_names: + return Index(result_names) + else: + return default_index(len(objs)) + else: + return ensure_index(keys).set_names(names) # type: ignore[arg-type] - if self.verify_integrity: - if not concat_axis.is_unique: - overlap = concat_axis[concat_axis.duplicated()].unique() - raise ValueError(f"Indexes have overlapping values: {overlap}") - return concat_axis +def _get_concat_axis_dataframe( + objs: list[Series | DataFrame], + axis: AxisInt, + ignore_index: bool, + keys: Iterable[Hashable] | None, + names: list[HashableT] | None, + levels, + verify_integrity: bool, +) -> Index: + """Return result concat axis when concatenating DataFrame objects.""" + indexes_gen = (x.axes[axis] for x in objs) + + if ignore_index: + return default_index(sum(len(i) for i in indexes_gen)) + else: + indexes = list(indexes_gen) + + if keys is None: + if levels is not None: + raise ValueError("levels supported only when keys is not None") + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex(indexes, keys, levels, names) + + if verify_integrity and not concat_axis.is_unique: + overlap = concat_axis[concat_axis.duplicated()].unique() + raise ValueError(f"Indexes have overlapping values: {overlap}") + + return concat_axis def _clean_keys_and_objs( @@ -680,7 +722,7 @@ def _clean_keys_and_objs( Returns ------- clean_objs : list[Series | DataFrame] - LIst of DataFrame and Series with Nones removed. + List of DataFrame and Series with Nones removed. keys : Index | None None if keys was None Index if objs was a Mapping or keys was not None. Filtered where objs was None. 
@@ -690,28 +732,33 @@ def _clean_keys_and_objs( if isinstance(objs, abc.Mapping): if keys is None: keys = objs.keys() - objs_list = [objs[k] for k in keys] - else: - objs_list = list(objs) + objs = [objs[k] for k in keys] + elif isinstance(objs, (ABCSeries, ABCDataFrame)) or is_scalar(objs): + raise TypeError( + "first argument must be an iterable of pandas " + f'objects, you passed an object of type "{type(objs).__name__}"' + ) + elif not isinstance(objs, abc.Sized): + objs = list(objs) - if len(objs_list) == 0: + if len(objs) == 0: raise ValueError("No objects to concatenate") if keys is not None: if not isinstance(keys, Index): keys = Index(keys) - if len(keys) != len(objs_list): + if len(keys) != len(objs): # GH#43485 raise ValueError( f"The length of the keys ({len(keys)}) must match " - f"the length of the objects to concatenate ({len(objs_list)})" + f"the length of the objects to concatenate ({len(objs)})" ) # GH#1649 key_indices = [] clean_objs = [] ndims = set() - for i, obj in enumerate(objs_list): + for i, obj in enumerate(objs): if obj is None: continue elif isinstance(obj, (ABCSeries, ABCDataFrame)): diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 9d88e61951e99..c397c1c2566a5 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import missing as libmissing from pandas._libs.sparse import IntIndex from pandas.core.dtypes.common import ( @@ -256,7 +257,7 @@ def _get_dummies_1d( dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] elif ( isinstance(input_dtype, StringDtype) - and input_dtype.storage != "pyarrow_numpy" + and input_dtype.na_value is libmissing.NA ): dtype = pandas_dtype("boolean") # type: ignore[assignment] else: diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 294de2cf2fe1d..bfd8e3ccd2f7c 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -15,7 +15,6 @@ import pandas.core.algorithms as algos from pandas.core.indexes.api import MultiIndex from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import tile_compat from pandas.core.tools.numeric import to_numeric if TYPE_CHECKING: @@ -202,9 +201,9 @@ def melt( if value_vars_was_not_none: frame = frame.iloc[:, algos.unique(idx)] else: - frame = frame.copy() + frame = frame.copy(deep=False) else: - frame = frame.copy() + frame = frame.copy(deep=False) if col_level is not None: # allow list or other? 
# frame is a copy @@ -266,7 +265,8 @@ def melt( result = frame._constructor(mdata, columns=mcolumns) if not ignore_index: - result.index = tile_compat(frame.index, num_cols_adjusted) + taker = np.tile(np.arange(len(frame)), num_cols_adjusted) + result.index = frame.index.take(taker) return result diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e6e84c2135b82..6364072fd215c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -39,11 +39,7 @@ npt, ) from pandas.errors import MergeError -from pandas.util._decorators import ( - Appender, - Substitution, - cache_readonly, -) +from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype @@ -95,7 +91,6 @@ ensure_wrapped_if_datetimelike, extract_array, ) -from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index from pandas.core.sorting import ( get_group_index, @@ -133,8 +128,6 @@ _known = (np.ndarray, ExtensionArray, Index, ABCSeries) -@Substitution("\nleft : DataFrame or named Series") -@Appender(_merge_doc, indents=0) def merge( left: DataFrame | Series, right: DataFrame | Series, @@ -150,6 +143,210 @@ def merge( indicator: str | bool = False, validate: str | None = None, ) -> DataFrame: + """ + Merge DataFrame or named Series objects with a database-style join. + + A named Series object is treated as a DataFrame with a single named column. + + The join is done on columns or indexes. If joining columns on + columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes + on indexes or indexes on a column or columns, the index will be passed on. + When performing a cross merge, no column specifications to merge on are + allowed. + + .. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + + Parameters + ---------- + left : DataFrame or named Series + First pandas object to merge. + right : DataFrame or named Series + Second pandas object to merge. + how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. + left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. + right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. 
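A brief aside before the merge docstring continues: the melt() hunk above replaces the deleted `tile_compat` helper with an inline `np.tile`/`Index.take` combination. A small sketch showing the equivalence for `ignore_index=False` (frame invented for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["r0", "r1"])

    # melt repeats the original index once per value column; here
    # num_cols_adjusted == 2, for the two value columns "A" and "B"
    taker = np.tile(np.arange(len(df)), 2)   # array([0, 1, 0, 1])
    expected = df.index.take(taker)          # Index(['r0', 'r1', 'r0', 'r1'])

    assert df.melt(ignore_index=False).index.equals(expected)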
+ left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. + right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. + sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). + suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + copy : bool, default False + If False, avoid copy if possible. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + .. deprecated:: 3.0.0 + indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + + validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Returns + ------- + DataFrame + A DataFrame of the two merged objects. + + See Also + -------- + merge_ordered : Merge with optional filling/interpolation. + merge_asof : Merge on nearest keys. + DataFrame.join : Similar method using indices. + + Examples + -------- + >>> df1 = pd.DataFrame( + ... {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]} + ... ) + >>> df2 = pd.DataFrame( + ... {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]} + ... ) + >>> df1 + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + >>> df2 + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey") + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2 with specified left and right suffixes + appended to any overlapping columns. 
+ + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=("_left", "_right")) + lkey value_left rkey value_right + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + Merge DataFrames df1 and df2, but raise an exception if the DataFrames have + any overlapping columns. + + >>> df1.merge(df2, left_on="lkey", right_on="rkey", suffixes=(False, False)) + Traceback (most recent call last): + ... + ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + + >>> df1 = pd.DataFrame({"a": ["foo", "bar"], "b": [1, 2]}) + >>> df2 = pd.DataFrame({"a": ["foo", "baz"], "c": [3, 4]}) + >>> df1 + a b + 0 foo 1 + 1 bar 2 + >>> df2 + a c + 0 foo 3 + 1 baz 4 + + >>> df1.merge(df2, how="inner", on="a") + a b c + 0 foo 1 3 + + >>> df1.merge(df2, how="left", on="a") + a b c + 0 foo 1 3.0 + 1 bar 2 NaN + + >>> df1 = pd.DataFrame({"left": ["foo", "bar"]}) + >>> df2 = pd.DataFrame({"right": [7, 8]}) + >>> df1 + left + 0 foo + 1 bar + >>> df2 + right + 0 7 + 1 8 + + >>> df1.merge(df2, how="cross") + left right + 0 foo 7 + 1 foo 8 + 2 bar 7 + 3 bar 8 + """ left_df = _validate_operand(left) left._check_copy_deprecation(copy) right_df = _validate_operand(right) @@ -316,7 +513,9 @@ def merge_ordered( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label or list Field names to join on. Must be found in both DataFrames. left_on : label or list, or array-like @@ -474,7 +673,9 @@ def merge_asof( Parameters ---------- left : DataFrame or named Series + First pandas object to merge. right : DataFrame or named Series + Second pandas object to merge. on : label Field name to join on. Must be found in both DataFrames. The data MUST be ordered. Furthermore this must be a numeric column, @@ -513,6 +714,7 @@ def merge_asof( Returns ------- DataFrame + A DataFrame of the two merged objects. 
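The warning near the top of the inlined merge docstring deserves a concrete illustration; null keys match each other, unlike in SQL (frames invented here):

    import numpy as np
    import pandas as pd

    left = pd.DataFrame({"key": ["a", np.nan], "lval": [1, 2]})
    right = pd.DataFrame({"key": ["a", np.nan], "rval": [3, 4]})

    # The NaN rows are paired with each other rather than dropped:
    print(pd.merge(left, right, on="key"))
    #    key  lval  rval
    # 0    a     1     3
    # 1  NaN     2     4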
See Also -------- @@ -2475,8 +2677,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) - and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e0126d439a79c..0886aad310034 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -3,7 +3,6 @@ import itertools from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -11,10 +10,6 @@ import numpy as np from pandas._libs import lib -from pandas.util._decorators import ( - Appender, - Substitution, -) from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -29,7 +24,6 @@ ) import pandas.core.common as com -from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper from pandas.core.indexes.api import ( Index, @@ -37,11 +31,13 @@ get_objs_combined_axis, ) from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._typing import ( AggFuncType, @@ -54,10 +50,6 @@ from pandas import DataFrame -# Note: We need to make sure `frame` is imported before `pivot`, otherwise -# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot_table"], indents=1) def pivot_table( data: DataFrame, values=None, @@ -70,7 +62,178 @@ def pivot_table( margins_name: Hashable = "All", observed: bool = True, sort: bool = True, + **kwargs, ) -> DataFrame: + """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects + (hierarchical indexes) on the index and columns of the result DataFrame. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + values : list-like or scalar, optional + Column or columns to aggregate. + index : column, Grouper, array, or list of the previous + Keys to group by on the pivot table index. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + columns : column, Grouper, array, or list of the previous + Keys to group by on the pivot table column. If a list is passed, + it can contain any of the other types (except list). If an array is + passed, it must be the same length as the data and will be used in + the same manner as column values. + aggfunc : function, list of functions, dict, default "mean" + If a list of functions is passed, the resulting pivot table will have + hierarchical columns whose top level are the function names + (inferred from the function objects themselves). + If a dict is passed, the key is column to aggregate and the value is + function or list of functions. If ``margins=True``, aggfunc will be + used to calculate the partial aggregates. + fill_value : scalar, default None + Value to replace missing values with (in the resulting pivot table, + after aggregation). 
+    margins : bool, default False
+        If ``margins=True``, special ``All`` columns and rows
+        will be added with partial group aggregates across the categories
+        on the rows and columns.
+    dropna : bool, default True
+        Do not include columns whose entries are all NaN. If True,
+        rows with a NaN value in any column will be omitted before
+        computing margins.
+    margins_name : str, default 'All'
+        Name of the row / column that will contain the totals
+        when margins is True.
+    observed : bool, default True
+        This only applies if any of the groupers are Categoricals.
+        If True: only show observed values for categorical groupers.
+        If False: show all values for categorical groupers.
+
+        .. versionchanged:: 3.0.0
+
+            The default value is now ``True``.
+
+    sort : bool, default True
+        Specifies if the result should be sorted.
+
+        .. versionadded:: 1.3.0
+
+    **kwargs : dict
+        Optional keyword arguments to pass to ``aggfunc``.
+
+        .. versionadded:: 3.0.0
+
+    Returns
+    -------
+    DataFrame
+        An Excel style pivot table.
+
+    See Also
+    --------
+    DataFrame.pivot : Pivot without aggregation that can handle
+        non-numeric data.
+    DataFrame.melt : Unpivot a DataFrame from wide to long format,
+        optionally leaving identifiers set.
+    wide_to_long : Wide panel to long format. Less flexible but more
+        user-friendly than melt.
+
+    Notes
+    -----
+    Reference :ref:`the user guide <reshaping.pivot>` for more examples.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
+    ...         "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
+    ...         "C": [
+    ...             "small",
+    ...             "large",
+    ...             "large",
+    ...             "small",
+    ...             "small",
+    ...             "large",
+    ...             "small",
+    ...             "small",
+    ...             "large",
+    ...         ],
+    ...         "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+    ...         "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+    ...     }
+    ... )
+    >>> df
+         A    B      C  D  E
+    0  foo  one  small  1  2
+    1  foo  one  large  2  4
+    2  foo  one  large  2  5
+    3  foo  two  small  3  5
+    4  foo  two  small  3  6
+    5  bar  one  large  4  6
+    6  bar  one  small  5  8
+    7  bar  two  small  6  9
+    8  bar  two  large  7  9
+
+    This first example aggregates values by taking the sum.
+
+    >>> table = pd.pivot_table(
+    ...     df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum"
+    ... )
+    >>> table
+    C        large  small
+    A   B
+    bar one    4.0    5.0
+        two    7.0    6.0
+    foo one    4.0    1.0
+        two    NaN    6.0
+
+    We can also fill missing values using the `fill_value` parameter.
+
+    >>> table = pd.pivot_table(
+    ...     df, values="D", index=["A", "B"], columns=["C"], aggfunc="sum", fill_value=0
+    ... )
+    >>> table
+    C        large  small
+    A   B
+    bar one      4      5
+        two      7      6
+    foo one      4      1
+        two      0      6
+
+    The next example aggregates by taking the mean across multiple columns.
+
+    >>> table = pd.pivot_table(
+    ...     df, values=["D", "E"], index=["A", "C"], aggfunc={"D": "mean", "E": "mean"}
+    ... )
+    >>> table
+                      D         E
+    A   C
+    bar large  5.500000  7.500000
+        small  5.500000  8.500000
+    foo large  2.000000  4.500000
+        small  2.333333  4.333333
+
+    We can also calculate multiple types of aggregations for any given
+    value column.
+
+    >>> table = pd.pivot_table(
+    ...     df,
+    ...     values=["D", "E"],
+    ...     index=["A", "C"],
+    ...     aggfunc={"D": "mean", "E": ["min", "max", "mean"]},
+    ...
) + >>> table + D E + mean max mean min + A C + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 + """ index = _convert_by(index) columns = _convert_by(columns) @@ -90,6 +253,7 @@ def pivot_table( margins_name=margins_name, observed=observed, sort=sort, + kwargs=kwargs, ) pieces.append(_table) keys.append(getattr(func, "__name__", func)) @@ -109,6 +273,7 @@ def pivot_table( margins_name, observed, sort, + kwargs, ) return table.__finalize__(data, method="pivot_table") @@ -125,6 +290,7 @@ def __internal_pivot_table( margins_name: Hashable, observed: bool, sort: bool, + kwargs, ) -> DataFrame: """ Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``. @@ -167,7 +333,7 @@ def __internal_pivot_table( values = list(values) grouped = data.groupby(keys, observed=observed, sort=sort, dropna=dropna) - agged = grouped.agg(aggfunc) + agged = grouped.agg(aggfunc, **kwargs) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") @@ -191,15 +357,11 @@ def __internal_pivot_table( if not dropna: if isinstance(table.index, MultiIndex): - m = MultiIndex.from_arrays( - cartesian_product(table.index.levels), names=table.index.names - ) + m = MultiIndex.from_product(table.index.levels, names=table.index.names) table = table.reindex(m, axis=0, fill_value=fill_value) if isinstance(table.columns, MultiIndex): - m = MultiIndex.from_arrays( - cartesian_product(table.columns.levels), names=table.columns.names - ) + m = MultiIndex.from_product(table.columns.levels, names=table.columns.names) table = table.reindex(m, axis=1, fill_value=fill_value) if sort is True and isinstance(table, ABCDataFrame): @@ -222,6 +384,7 @@ def __internal_pivot_table( rows=index, cols=columns, aggfunc=aggfunc, + kwargs=kwargs, observed=dropna, margins_name=margins_name, fill_value=fill_value, @@ -247,6 +410,7 @@ def _add_margins( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", fill_value=None, @@ -259,7 +423,7 @@ def _add_margins( if margins_name in table.index.get_level_values(level): raise ValueError(msg) - grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) + grand_margin = _compute_grand_margin(data, values, aggfunc, kwargs, margins_name) if table.ndim == 2: # i.e. 
DataFrame @@ -280,7 +444,15 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name + table, + data, + values, + rows, + cols, + aggfunc, + kwargs, + observed, + margins_name, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -289,7 +461,7 @@ def _add_margins( # no values, and table is a DataFrame assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name + table, data, rows, cols, aggfunc, kwargs, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -326,26 +498,26 @@ def _add_margins( def _compute_grand_margin( - data: DataFrame, values, aggfunc, margins_name: Hashable = "All" + data: DataFrame, values, aggfunc, kwargs, margins_name: Hashable = "All" ): if values: grand_margin = {} for k, v in data[values].items(): try: if isinstance(aggfunc, str): - grand_margin[k] = getattr(v, aggfunc)() + grand_margin[k] = getattr(v, aggfunc)(**kwargs) elif isinstance(aggfunc, dict): if isinstance(aggfunc[k], str): - grand_margin[k] = getattr(v, aggfunc[k])() + grand_margin[k] = getattr(v, aggfunc[k])(**kwargs) else: - grand_margin[k] = aggfunc[k](v) + grand_margin[k] = aggfunc[k](v, **kwargs) else: - grand_margin[k] = aggfunc(v) + grand_margin[k] = aggfunc(v, **kwargs) except TypeError: pass return grand_margin else: - return {margins_name: aggfunc(data.index)} + return {margins_name: aggfunc(data.index, **kwargs)} def _generate_marginal_results( @@ -355,6 +527,7 @@ def _generate_marginal_results( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", ): @@ -368,15 +541,17 @@ def _all_key(key): return (key, margins_name) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) + margin = ( + data[rows + values] + .groupby(rows, observed=observed) + .agg(aggfunc, **kwargs) + ) cat_axis = 1 for key, piece in table.T.groupby(level=0, observed=observed): piece = piece.T all_key = _all_key(key) - # we are going to mutate this, so need to copy! - piece = piece.copy() piece[all_key] = margin[key] table_pieces.append(piece) @@ -393,7 +568,7 @@ def _all_key(key): table_pieces.append(piece) # GH31016 this is to calculate margin for each group, and assign # corresponded key as index - transformed_piece = DataFrame(piece.apply(aggfunc)).T + transformed_piece = DataFrame(piece.apply(aggfunc, **kwargs)).T if isinstance(piece.index, MultiIndex): # We are adding an empty level transformed_piece.index = MultiIndex.from_tuples( @@ -423,7 +598,9 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) + row_margin = ( + data[cols + values].groupby(cols, observed=observed).agg(aggfunc, **kwargs) + ) row_margin = row_margin.stack() # GH#26568. 
Use names instead of indices in case of numeric names @@ -442,6 +619,7 @@ def _generate_marginal_results_without_values( rows, cols, aggfunc, + kwargs, observed: bool, margins_name: Hashable = "All", ): @@ -456,14 +634,16 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply( + aggfunc, **kwargs + ) all_key = _all_key() table[all_key] = margin result = table margin_keys.append(all_key) else: - margin = data.groupby(level=0, observed=observed).apply(aggfunc) + margin = data.groupby(level=0, observed=observed).apply(aggfunc, **kwargs) all_key = _all_key() table[all_key] = margin result = table @@ -474,7 +654,9 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply( + aggfunc, **kwargs + ) else: row_margin = Series(np.nan, index=result.columns) @@ -495,8 +677,6 @@ def _convert_by(by): return by -@Substitution("\ndata : DataFrame") -@Appender(_shared_docs["pivot"], indents=1) def pivot( data: DataFrame, *, @@ -504,16 +684,162 @@ def pivot( index: IndexLabel | lib.NoDefault = lib.no_default, values: IndexLabel | lib.NoDefault = lib.no_default, ) -> DataFrame: + """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation, multiple values will result in a MultiIndex in the + columns. See the :ref:`User Guide ` for more on reshaping. + + Parameters + ---------- + data : DataFrame + Input pandas DataFrame object. + columns : str or object or a list of str + Column to use to make new frame's columns. + index : str or object or a list of str, optional + Column to use to make new frame's index. If not given, uses existing index. + values : str, object or a list of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + Returns + ------- + DataFrame + Returns reshaped DataFrame. + + Raises + ------ + ValueError: + When there are any `index`, `columns` combinations with multiple + values. `DataFrame.pivot_table` when you need to aggregate. + + See Also + -------- + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... "zoo": ["x", "y", "z", "q", "w", "t"], + ... } + ... 
) + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + 5 two C 6 t + + >>> df.pivot(index="foo", columns="bar", values="baz") + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar")["baz"] + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index="foo", columns="bar", values=["baz", "zoo"]) + baz zoo + bar A B C A B C + foo + one 1 2 3 x y z + two 4 5 6 q w t + + You could also assign a list of column names or a list of index names. + + >>> df = pd.DataFrame( + ... { + ... "lev1": [1, 1, 1, 2, 2, 2], + ... "lev2": [1, 1, 2, 1, 1, 2], + ... "lev3": [1, 2, 1, 2, 1, 2], + ... "lev4": [1, 2, 3, 4, 5, 6], + ... "values": [0, 1, 2, 3, 4, 5], + ... } + ... ) + >>> df + lev1 lev2 lev3 lev4 values + 0 1 1 1 1 0 + 1 1 1 2 2 1 + 2 1 2 1 3 2 + 3 2 1 2 4 3 + 4 2 1 1 5 4 + 5 2 2 2 6 5 + + >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") + lev2 1 2 + lev3 1 2 1 2 + lev1 + 1 0.0 1.0 2.0 NaN + 2 4.0 3.0 NaN 5.0 + + >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") + lev3 1 2 + lev1 lev2 + 1 1 0.0 1.0 + 2 2.0 NaN + 2 1 4.0 3.0 + 2 NaN 5.0 + + A ValueError is raised if there are any duplicates. + + >>> df = pd.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two"], + ... "bar": ["A", "A", "B", "C"], + ... "baz": [1, 2, 3, 4], + ... } + ... ) + >>> df + foo bar baz + 0 one A 1 + 1 one A 2 + 2 two B 3 + 3 two C 4 + + Notice that the first two rows are the same for our `index` + and `columns` arguments. + + >>> df.pivot(index="foo", columns="bar", values="baz") + Traceback (most recent call last): + ... + ValueError: Index contains duplicate entries, cannot reshape + """ columns_listlike = com.convert_to_list_like(columns) # If columns is None we will create a MultiIndex level with None as name # which might cause duplicated names because None is the default for # level names - data = data.copy(deep=False) - data.index = data.index.copy() - data.index.names = [ - name if name is not None else lib.no_default for name in data.index.names - ] + if any(name is None for name in data.index.names): + data = data.copy(deep=False) + data.index.names = [ + name if name is not None else lib.no_default for name in data.index.names + ] indexed: DataFrame | Series if values is lib.no_default: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 5426c72a356d6..9b7b768fe7adb 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -42,7 +42,7 @@ from pandas.core.indexes.api import ( Index, MultiIndex, - RangeIndex, + default_index, ) from pandas.core.reshape.concat import concat from pandas.core.series import Series @@ -288,21 +288,19 @@ def get_new_values(self, values, fill_value=None): dtype = values.dtype - # if our mask is all True, then we can use our existing dtype - if mask_all: - dtype = values.dtype - new_values = np.empty(result_shape, dtype=dtype) - else: - if isinstance(dtype, ExtensionDtype): - # GH#41875 - # We are assuming that fill_value can be held by this dtype, - # unlike the non-EA case that promotes. - cls = dtype.construct_array_type() - new_values = cls._empty(result_shape, dtype=dtype) + if isinstance(dtype, ExtensionDtype): + # GH#41875 + # We are assuming that fill_value can be held by this dtype, + # unlike the non-EA case that promotes. 
+ cls = dtype.construct_array_type() + new_values = cls._empty(result_shape, dtype=dtype) + if not mask_all: new_values[:] = fill_value - else: + else: + if not mask_all: dtype, fill_value = maybe_promote(dtype, fill_value) - new_values = np.empty(result_shape, dtype=dtype) + new_values = np.empty(result_shape, dtype=dtype) + if not mask_all: new_values.fill(fill_value) name = dtype.name @@ -461,7 +459,7 @@ def _unstack_multiple( ) if isinstance(data, Series): - dummy = data.copy() + dummy = data.copy(deep=False) dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort) @@ -842,7 +840,7 @@ def _convert_level_number(level_num: int, columns: Index): [x._values.astype(dtype, copy=False) for _, x in subset.items()] ) N, K = subset.shape - idx = np.arange(N * K).reshape(K, N).T.ravel() + idx = np.arange(N * K).reshape(K, N).T.reshape(-1) value_slice = value_slice.take(idx) else: value_slice = subset.values @@ -924,7 +922,7 @@ def _reorder_for_extension_array_stack( # idx is an indexer like # [c0r0, c1r0, c2r0, ..., # c0r1, c1r1, c2r1, ...] - idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() + idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.reshape(-1) return arr.take(idx) @@ -1025,7 +1023,7 @@ def stack_reshape( buf = [] for idx in stack_cols.unique(): if len(frame.columns) == 1: - data = frame.copy() + data = frame.copy(deep=False) else: if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple): # GH#57750 - if the frame is an Index with tuples, .loc below will fail @@ -1047,7 +1045,7 @@ def stack_reshape( if data.ndim == 1: data.name = 0 else: - data.columns = RangeIndex(len(data.columns)) + data.columns = default_index(len(data.columns)) buf.append(data) if len(buf) > 0 and not frame.empty: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 1499afbde56d3..18517199f073c 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -44,6 +43,8 @@ from pandas.core.arrays.datetimelike import dtype_to_unit if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( DtypeObj, IntervalLeftRight, @@ -141,12 +142,17 @@ def cut( fixed set of values. Series : One-dimensional array with axis labels (including time series). IntervalIndex : Immutable Index implementing an ordered, sliceable set. + numpy.histogram_bin_edges: Function to calculate only the edges of the bins + used by the histogram function. Notes ----- Any NA values will be NA in the result. Out of bounds values will be NA in the resulting Series or Categorical object. + ``numpy.histogram_bin_edges`` can be used along with cut to calculate bins according + to some predefined methods. + Reference :ref:`the user guide ` for more examples. Examples @@ -238,6 +244,16 @@ def cut( >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]] + + Using np.histogram_bin_edges with cut + + >>> pd.cut( + ... np.array([1, 7, 5, 4]), + ... bins=np.histogram_bin_edges(np.array([1, 7, 5, 4]), bins="auto"), + ... ) + ... 
# doctest: +ELLIPSIS
+    [NaN, (5.0, 7.0], (3.0, 5.0], (3.0, 5.0]]
+    Categories (3, interval[float64, right]): [(1.0, 3.0] < (3.0, 5.0] < (5.0, 7.0]]
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
@@ -289,6 +305,7 @@ def qcut(
     Parameters
     ----------
     x : 1d ndarray or Series
+        Input Numpy array or pandas Series object to be discretized.
     q : int or list-like of float
         Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
         array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
@@ -313,6 +330,11 @@ def qcut(
     bins : ndarray of floats
         Returned only if `retbins` is True.
 
+    See Also
+    --------
+    cut : Bin values into discrete intervals.
+    Series.quantile : Return value at the given quantile.
+
     Notes
     -----
     Out of bounds values will be NA in the resulting Categorical object
diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py
deleted file mode 100644
index 0f1fbc662e1a6..0000000000000
--- a/pandas/core/reshape/util.py
+++ /dev/null
@@ -1,85 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-import numpy as np
-
-from pandas.core.dtypes.common import is_list_like
-
-if TYPE_CHECKING:
-    from pandas._typing import NumpyIndexT
-
-
-def cartesian_product(X) -> list[np.ndarray]:
-    """
-    Numpy version of itertools.product.
-    Sometimes faster (for large inputs)...
-
-    Parameters
-    ----------
-    X : list-like of list-likes
-
-    Returns
-    -------
-    product : list of ndarrays
-
-    Examples
-    --------
-    >>> cartesian_product([list("ABC"), [1, 2]])
-    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])]
-
-    See Also
-    --------
-    itertools.product : Cartesian product of input iterables.  Equivalent to
-        nested for-loops.
-    """
-    msg = "Input must be a list-like of list-likes"
-    if not is_list_like(X):
-        raise TypeError(msg)
-    for x in X:
-        if not is_list_like(x):
-            raise TypeError(msg)
-
-    if len(X) == 0:
-        return []
-
-    lenX = np.fromiter((len(x) for x in X), dtype=np.intp)
-    cumprodX = np.cumprod(lenX)
-
-    if np.any(cumprodX < 0):
-        raise ValueError("Product space too large to allocate arrays!")
-
-    a = np.roll(cumprodX, 1)
-    a[0] = 1
-
-    if cumprodX[-1] != 0:
-        b = cumprodX[-1] / cumprodX
-    else:
-        # if any factor is empty the cartesian product is empty
-        b = np.zeros_like(cumprodX)
-
-    return [
-        tile_compat(np.repeat(x, b[i]), np.prod(a[i]))
-        for i, x in enumerate(X)
-    ]
-
-
-def tile_compat(arr: NumpyIndexT, num: int) -> NumpyIndexT:
-    """
-    Index compat for np.tile.
-
-    Notes
-    -----
-    Does not support multi-dimensional `num`.
-    """
-    if isinstance(arr, np.ndarray):
-        return np.tile(arr, num)
-
-    # Otherwise we have an Index
-    taker = np.tile(np.arange(len(arr)), num)
-    return arr.take(taker)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index c49eef49f7393..a197886748bce 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 from collections.abc import (
+    Callable,
     Hashable,
     Iterable,
     Mapping,
@@ -17,7 +18,6 @@
     IO,
     TYPE_CHECKING,
     Any,
-    Callable,
     Literal,
     cast,
     overload,
@@ -49,7 +49,6 @@
     deprecate_nonkeyword_arguments,
     doc,
 )
-from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
     validate_ascending,
     validate_bool_kwarg,
@@ -257,6 +256,7 @@ class Series(base.IndexOpsMixin, NDFrame):  # type: ignore[misc]
         Data type for the output Series. If not specified, this will be
         inferred from `data`.
         See the :ref:`user guide <basics.dtypes>` for more usages.
+        If ``data`` is a Series, this is ignored.
     name : Hashable, default None
         The name to give to the Series.
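An aside on the pivot.py hunks further up: the new ``**kwargs`` parameter of pivot_table is forwarded to ``groupby(...).agg`` and to every margin helper (``_compute_grand_margin`` and friends). A minimal sketch; ``scaled_sum`` and ``factor`` are invented names, and the printed output is approximate:

    import pandas as pd

    df = pd.DataFrame({"A": ["x", "x", "y"], "D": [1.0, 2.0, 4.0]})

    def scaled_sum(s, factor=1):
        return s.sum() * factor

    # factor=10 reaches scaled_sum for each group and for the margins row
    print(pd.pivot_table(df, values="D", index="A",
                         aggfunc=scaled_sum, factor=10, margins=True))
    #         D
    # A
    # x    30.0
    # y    40.0
    # All  70.0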
copy : bool, default False @@ -389,10 +389,6 @@ def __init__( self.name = name return - is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) - data_dtype = getattr(data, "dtype", None) - original_dtype = dtype - if isinstance(data, (ExtensionArray, np.ndarray)): if copy is not False: if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): @@ -438,7 +434,6 @@ def __init__( data = data.astype(dtype) refs = data._references - data = data._values copy = False elif isinstance(data, np.ndarray): @@ -512,17 +507,6 @@ def __init__( self.name = name self._set_axis(0, index) - if original_dtype is None and is_pandas_object and data_dtype == np.object_: - if self.dtype != data_dtype: - warnings.warn( - "Dtype inference on a pandas object " - "(Series, Index, ExtensionArray) is deprecated. The Series " - "constructor will keep the original dtype in the future. " - "Call `infer_objects` on the result to get the old behavior.", - FutureWarning, - stacklevel=find_stack_level(), - ) - def _init_dict( self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): @@ -1442,7 +1426,7 @@ def to_string( ) -> None: ... @deprecate_nonkeyword_arguments( - version="3.0.0", allowed_args=["self", "buf"], name="to_string" + version="4.0", allowed_args=["self", "buf"], name="to_string" ) def to_string( self, @@ -1600,7 +1584,7 @@ def to_markdown( ), ) @deprecate_nonkeyword_arguments( - version="3.0.0", allowed_args=["self", "buf"], name="to_markdown" + version="4.0", allowed_args=["self", "buf"], name="to_markdown" ) def to_markdown( self, @@ -1729,6 +1713,12 @@ def to_dict( collections.abc.MutableMapping Key-value representation of Series. + See Also + -------- + Series.to_list: Converts Series to a list of the values. + Series.to_numpy: Converts Series to NumPy ndarray. + Series.array: ExtensionArray of the data backing this Series. + Examples -------- >>> s = pd.Series([1, 2, 3, 4]) @@ -1766,6 +1756,10 @@ def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: DataFrame DataFrame representation of Series. + See Also + -------- + Series.to_dict : Convert Series to dict object. 
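The block removed from ``Series.__init__`` above enforces an earlier deprecation: constructing a Series from an object-dtype Series/Index/ExtensionArray now keeps object dtype silently instead of warning about future inference changes. A sketch of the resulting behaviour, as I read the change:

    import pandas as pd

    idx = pd.Index([1, 2, 3], dtype=object)

    ser = pd.Series(idx)              # no FutureWarning anymore
    print(ser.dtype)                  # object -- the input dtype is kept
    print(ser.infer_objects().dtype)  # int64 -- inference is now opt-in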
+ Examples -------- >>> s = pd.Series(["a", "b", "c"], name="vals") @@ -1821,14 +1815,30 @@ def _set_name( Parrot 30.0 Parrot 20.0 Name: Max Speed, dtype: float64 + + We can pass a list of values to group the Series data by custom labels: + >>> ser.groupby(["a", "b", "a", "b"]).mean() a 210.0 b 185.0 Name: Max Speed, dtype: float64 + + Grouping by numeric labels yields similar results: + + >>> ser.groupby([0, 1, 0, 1]).mean() + 0 210.0 + 1 185.0 + Name: Max Speed, dtype: float64 + + We can group by a level of the index: + >>> ser.groupby(level=0).mean() Falcon 370.0 Parrot 25.0 Name: Max Speed, dtype: float64 + + We can group by a condition applied to the Series values: + >>> ser.groupby(ser > 100).mean() Max Speed False 25.0 @@ -1851,11 +1861,16 @@ def _set_name( Parrot Captive 30.0 Wild 20.0 Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).mean() Animal Falcon 370.0 Parrot 25.0 Name: Max Speed, dtype: float64 + + We can also group by the 'Type' level of the hierarchical index + to get the mean speed for each type: + >>> ser.groupby(level="Type").mean() Type Captive 210.0 @@ -1871,12 +1886,17 @@ def _set_name( b 3 dtype: int64 + To include `NA` values in the group keys, set `dropna=False`: + >>> ser.groupby(level=0, dropna=False).sum() a 3 b 3 NaN 3 dtype: int64 + We can also group by a custom list with NaN values to handle + missing group labels: + >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot'] >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed") >>> ser.groupby(["a", "b", "a", np.nan]).mean() @@ -2060,14 +2080,14 @@ def unique(self) -> ArrayLike: >>> pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique() ['2016-01-01 00:00:00'] - Length: 1, dtype: datetime64[ns] + Length: 1, dtype: datetime64[s] >>> pd.Series( ... [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)] ... ).unique() ['2016-01-01 00:00:00-05:00'] - Length: 1, dtype: datetime64[ns, US/Eastern] + Length: 1, dtype: datetime64[s, US/Eastern] An Categorical will return categories in the order of appearance and with the same dtype. @@ -3175,6 +3195,7 @@ def combine_first(self, other) -> Series: other = other.reindex(keep_other) if this.dtype.kind == "M" and other.dtype.kind != "M": + # TODO: try to match resos? other = to_datetime(other) combined = concat([this, other]) combined = combined.reindex(new_index) @@ -3737,25 +3758,7 @@ def argsort( # GH#54257 We allow -1 here so that np.argsort(series) works self._get_axis_number(axis) - values = self._values - mask = isna(values) - - if mask.any(): - # TODO(3.0): once this deprecation is enforced we can call - # self.array.argsort directly, which will close GH#43840 and - # GH#12694 - warnings.warn( - "The behavior of Series.argsort in the presence of NA values is " - "deprecated. In a future version, NA values will be ordered " - "last instead of set to -1.", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = np.full(len(self), -1, dtype=np.intp) - notmask = ~mask - result[notmask] = np.argsort(values[notmask], kind=kind) - else: - result = np.argsort(values, kind=kind) + result = self.array.argsort(kind=kind) res = self._constructor( result, index=self.index, name=self.name, dtype=np.intp, copy=False @@ -4980,7 +4983,7 @@ def drop( C 2 dtype: int64 - Drop labels B en C + Drop labels B and C >>> s.drop(labels=["B", "C"]) A 0 @@ -5035,7 +5038,8 @@ def pop(self, item: Hashable) -> Any: Returns ------- - Value that is popped from series. + scalar + Value that is popped from series. 
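The argsort hunk above enforces another deprecation: instead of writing -1 into the NA slots (with a FutureWarning), Series.argsort now defers to the array's own argsort, which orders missing values last. A before/after sketch:

    import numpy as np
    import pandas as pd

    ser = pd.Series([3.0, np.nan, 1.0])

    print(ser.argsort().tolist())
    # previously: [1, -1, 0] plus a FutureWarning (NA marked with -1)
    # now:        [2, 0, 1]  (the NaN slot simply sorts last)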
Examples -------- @@ -6065,8 +6069,69 @@ def lt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.lt, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("ge", "series")) def ge(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Greater than or equal to of series and other, \ + element-wise (binary operator `ge`). + + Equivalent to ``series >= other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.gt : Greater than comparison, element-wise. + Series.le : Less than or equal to comparison, element-wise. + Series.lt : Less than comparison, element-wise. + Series.eq : Equal to comparison, element-wise. + Series.ne : Not equal to comparison, element-wise. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan, 1], index=["a", "b", "c", "d", "e"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + e 1.0 + dtype: float64 + >>> b = pd.Series([0, 1, 2, np.nan, 1], index=["a", "b", "c", "d", "f"]) + >>> b + a 0.0 + b 1.0 + c 2.0 + d NaN + f 1.0 + dtype: float64 + >>> a.ge(b, fill_value=0) + a True + b True + c False + d False + e True + f False + dtype: bool + """ return self._flex_method( other, operator.ge, level=level, fill_value=fill_value, axis=axis ) @@ -6484,7 +6549,7 @@ def any( # type: ignore[override] filter_type="bool", ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="all") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") @Appender(make_doc("all", ndim=1)) def all( self, @@ -6504,7 +6569,7 @@ def all( filter_type="bool", ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="min") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") def min( self, axis: Axis | None = 0, @@ -6539,7 +6604,7 @@ def min( Returns ------- scalar or Series (if level specified) - The maximum of the values in the Series. + The minimum of the values in the Series. See Also -------- @@ -6575,7 +6640,7 @@ def min( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="max") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") def max( self, axis: Axis | None = 0, @@ -6646,7 +6711,7 @@ def max( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") def sum( self, axis: Axis | None = None, @@ -6688,7 +6753,7 @@ def sum( Returns ------- scalar or Series (if level specified) - Median of the values for the requested axis. + Sum of the values for the requested axis. 
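All of the ``deprecate_nonkeyword_arguments`` bumps in this file push the enforcement target from 3.0 to 4.0, so positional calls keep warning for one more major cycle. A sketch (warning text approximate):

    import pandas as pd

    ser = pd.Series([1.0, 2.0, 3.0])

    ser.sum(None, True)              # FutureWarning: arguments of sum other than
                                     # 'self' will be keyword-only in pandas 4.0
    ser.sum(axis=None, skipna=True)  # the supported spelling; no warning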
See Also -------- @@ -6747,7 +6812,7 @@ def sum( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod") @doc(make_doc("prod", ndim=1)) def prod( self, @@ -6766,7 +6831,7 @@ def prod( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="mean") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean") def mean( self, axis: Axis | None = 0, @@ -6798,7 +6863,7 @@ def mean( Returns ------- scalar or Series (if level specified) - Median of the values for the requested axis. + Mean of the values for the requested axis. See Also -------- @@ -6820,7 +6885,7 @@ def mean( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="median") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median") def median( self, axis: Axis | None = 0, @@ -6901,7 +6966,7 @@ def median( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem") @doc(make_doc("sem", ndim=1)) def sem( self, @@ -6920,7 +6985,7 @@ def sem( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var") def var( self, axis: Axis | None = None, @@ -7007,7 +7072,7 @@ def var( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="std") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std") @doc(make_doc("std", ndim=1)) def std( self, @@ -7026,7 +7091,7 @@ def std( **kwargs, ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew") @doc(make_doc("skew", ndim=1)) def skew( self, @@ -7039,7 +7104,7 @@ def skew( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) - @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") + @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt") def kurt( self, axis: Axis | None = 0, diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 38a443b56ee3d..5725b96f66cd4 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -49,6 +49,8 @@ for more details. A passed user-defined-function will be passed a Series for evaluation. + +If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. 
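The sentence just added to the shared ``agg`` docstring refers to named relabeling; a short illustration of the restriction (frame invented):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

    # Index relabeling works along axis=0 / "index" ...
    print(df.agg(a_max=("A", "max"), b_min=("B", "min")))
    #          A    B
    # a_max  2.0  NaN
    # b_min  NaN  3.0
    # ... combining this form with axis=1 is rejected, per the note above.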
{examples}""" _shared_docs["compare"] = """ diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 4fba243f73536..0d8f42694ccb4 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -5,7 +5,6 @@ import itertools from typing import ( TYPE_CHECKING, - Callable, cast, ) @@ -32,6 +31,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7494a43caf004..b37e22c9a91ec 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -5,7 +5,6 @@ import re from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -50,6 +49,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, ) @@ -1085,6 +1085,13 @@ def get(self, i): Returns ------- Series or Index + Series or Index where each value is the extracted element from + the corresponding input component. + + See Also + -------- + Series.str.extract : Extract capture groups in the regex as columns + in a DataFrame. Examples -------- @@ -2971,6 +2978,9 @@ def extractall(self, pat, flags: int = 0) -> DataFrame: Returns ------- Series or Index of int. + A Series (if the input is a Series) or an Index (if the input is an + Index) of the %(side)s indexes corresponding to the positions where the + substring is found in each string of the input. See Also -------- @@ -2980,9 +2990,9 @@ def extractall(self, pat, flags: int = 0) -> DataFrame: -------- For Series.str.find: - >>> ser = pd.Series(["cow_", "duck_", "do_ve"]) + >>> ser = pd.Series(["_cow_", "duck_", "do_v_e"]) >>> ser.str.find("_") - 0 3 + 0 0 1 4 2 2 dtype: int64 diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index c1f94abff428a..1281a03e297f9 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -3,14 +3,16 @@ import abc from typing import ( TYPE_CHECKING, - Callable, Literal, ) import numpy as np if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) import re from pandas._typing import ( diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index bdcf55e61d2d1..290a28ab60ae1 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -5,7 +5,6 @@ import textwrap from typing import ( TYPE_CHECKING, - Callable, Literal, cast, ) @@ -22,7 +21,10 @@ from pandas.core.strings.base import BaseStringArrayMethods if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from pandas._typing import ( NpDtype, @@ -457,16 +459,7 @@ def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) def _str_removeprefix(self, prefix: str): - # outstanding question on whether to use native methods for users on Python 3.9+ - # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770, - # in which case we could do return self._str_map(str.removeprefix) - - def removeprefix(text: str) -> str: - if text.startswith(prefix): - return text[len(prefix) :] - return text - - return self._str_map(removeprefix) + return self._str_map(lambda x: x.removeprefix(prefix)) def _str_removesuffix(self, suffix: str): return self._str_map(lambda x: x.removesuffix(suffix)) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b01cdb335ec46..0e91bfa99e887 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ 
-6,7 +6,6 @@ from itertools import islice from typing import ( TYPE_CHECKING, - Callable, TypedDict, Union, cast, @@ -29,6 +28,7 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -76,7 +76,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from collections.abc import Hashable + from collections.abc import ( + Callable, + Hashable, + ) from pandas._libs.tslibs.nattype import NaTType from pandas._libs.tslibs.timedeltas import UnitChoices @@ -126,7 +129,7 @@ class FulldatetimeDict(YearMonthDayDict, total=False): def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721 + if type(first_non_nan_element := arr[first_non_null]) is str: # GH#32264 np.str_ object guessed_format = guess_datetime_format( first_non_nan_element, dayfirst=dayfirst @@ -524,6 +527,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: utc=utc, errors=errors, unit_for_numerics=unit, + creso=NpyDatetimeUnit.NPY_FR_ns.value, ) result = DatetimeIndex(arr, name=name) @@ -873,7 +877,7 @@ def to_datetime( >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 - dtype: datetime64[ns] + dtype: datetime64[s] Using a unix epoch time @@ -903,7 +907,7 @@ def to_datetime( Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. - >>> pd.to_datetime("13000101", format="%Y%m%d", errors="coerce") + >>> pd.to_datetime("invalid for Ymd", format="%Y%m%d", errors="coerce") NaT .. _to_datetime_tz_examples: @@ -916,14 +920,14 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00:00", "2018-10-26 13:00:15"]) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) - Timezone-aware inputs *with constant time offset* are converted to timezone-aware :class:`DatetimeIndex`: >>> pd.to_datetime(["2018-10-26 12:00 -0500", "2018-10-26 13:00 -0500"]) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], - dtype='datetime64[ns, UTC-05:00]', freq=None) + dtype='datetime64[s, UTC-05:00]', freq=None) - However, timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris) @@ -965,21 +969,21 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00"], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Timezone-aware inputs are *converted* to UTC (the output represents the exact same datetime, but viewed from the UTC time offset `+00:00`). 
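A brief aside on the ``[ns]`` to ``[s]``/``[us]`` changes threaded through these doctests: they follow from value-dependent resolution inference. A small sketch, assuming only what the updated outputs show (details may differ across pandas versions):

from datetime import datetime

import pandas as pd

# Second-resolution strings infer datetime64[s] ...
print(pd.to_datetime(["2018-10-26 12:00:00"]).dtype)

# ... while a datetime.datetime in the input parses at microsecond
# resolution, so the result is datetime64[us].
print(pd.to_datetime([datetime(2020, 1, 1, 18)]).dtype)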
>>> pd.to_datetime(["2018-10-26 12:00 -0530", "2018-10-26 12:00 -0500"], utc=True)
 DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'],
-              dtype='datetime64[ns, UTC]', freq=None)
+              dtype='datetime64[s, UTC]', freq=None)

 - Inputs can contain both strings and datetimes; the above rules still apply

 >>> pd.to_datetime(["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True)
 DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
-              dtype='datetime64[ns, UTC]', freq=None)
+              dtype='datetime64[us, UTC]', freq=None)
 """
 if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
     raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index 3d28a73df99d1..26e73794af298 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -7,7 +7,10 @@
 import numpy as np

-from pandas._libs import lib
+from pandas._libs import (
+    lib,
+    missing as libmissing,
+)
 from pandas.util._validators import check_dtype_backend

 from pandas.core.dtypes.cast import maybe_downcast_numeric
@@ -64,6 +67,7 @@ def to_numeric(
     ----------
     arg : scalar, list, tuple, 1-d array, or Series
         Argument to be converted.
+
     errors : {'raise', 'coerce'}, default 'raise'
         - If 'raise', then invalid parsing will raise an exception.
         - If 'coerce', then invalid parsing will be set as NaN.
@@ -88,14 +92,15 @@ def to_numeric(
         the dtype it is to be cast to, so if none of the dtypes
         checked satisfy that specification, no downcasting will be
         performed on the data.
-    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
         Back-end data type applied to the resultant :class:`DataFrame`
-        (still experimental). Behaviour is as follows:
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:

-        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
-          (default).
-        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
-          DataFrame.
+        * ``"numpy_nullable"``: returns a nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns a pyarrow-backed nullable :class:`ArrowDtype`
+          DataFrame

         .. versionadded:: 2.0
@@ -216,7 +221,7 @@ def to_numeric(
         coerce_numeric=coerce_numeric,
         convert_to_masked_nullable=dtype_backend is not lib.no_default
         or isinstance(values_dtype, StringDtype)
-        and not values_dtype.storage == "pyarrow_numpy",
+        and values_dtype.na_value is libmissing.NA,
     )

     if new_mask is not None:
@@ -227,7 +232,7 @@ def to_numeric(
         dtype_backend is not lib.no_default
         and new_mask is None
         or isinstance(values_dtype, StringDtype)
-        and not values_dtype.storage == "pyarrow_numpy"
+        and values_dtype.na_value is libmissing.NA
     ):
         new_mask = np.zeros(values.shape, dtype=np.bool_)
diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py
index 296168fe7e725..8d82a5c213910 100644
--- a/pandas/core/tools/timedeltas.py
+++ b/pandas/core/tools/timedeltas.py
@@ -170,7 +170,7 @@ def to_timedelta(
     TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02',
                     '0 days 00:00:03', '0 days 00:00:04'],
                    dtype='timedelta64[ns]', freq=None)

-    >>> pd.to_timedelta(np.arange(5), unit="d")
+    >>> pd.to_timedelta(np.arange(5), unit="D")
     TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
                    dtype='timedelta64[ns]', freq=None)
     """
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 3b9dd40a92ce8..e120e69dc27cf 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -244,6 +244,7 @@ def hash_array(
     Parameters
     ----------
     vals : ndarray or ExtensionArray
+        The input array to hash.
     encoding : str, default 'utf8'
         Encoding for data & key when strings.
     hash_key : str, default _default_hash_key
@@ -257,6 +258,11 @@ def hash_array(
     ndarray[np.uint64, ndim=1]
         Hashed values, same length as the vals.

+    See Also
+    --------
+    util.hash_pandas_object : Return a data hash of the Index/Series/DataFrame.
+    util.hash_tuples : Hash a MultiIndex / list-like of tuples efficiently.
+
     Examples
     --------
     >>> pd.util.hash_array(np.array([1, 2, 3]))
diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py
index a6079785e7475..de024f612516b 100644
--- a/pandas/core/util/numba_.py
+++ b/pandas/core/util/numba_.py
@@ -2,17 +2,18 @@
 from __future__ import annotations

+import inspect
 import types
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-)
+from typing import TYPE_CHECKING

 import numpy as np

 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import NumbaUtilError

+if TYPE_CHECKING:
+    from collections.abc import Callable
+
 GLOBAL_USE_NUMBA: bool = False

@@ -54,10 +55,15 @@ def get_jit_arguments(
         engine_kwargs = {}

     nopython = engine_kwargs.get("nopython", True)
-    if kwargs and nopython:
+    if kwargs:
+        # Note: in case numba supports keyword-only arguments in
+        # a future version, we should remove this check. But this
+        # seems unlikely to happen soon.
+
         raise NumbaUtilError(
-            "numba does not support kwargs with nopython=True: "
-            "https://github.com/numba/numba/issues/2916"
+            "numba does not support keyword-only arguments, see "
+            "https://github.com/numba/numba/issues/2916, "
+            "https://github.com/numba/numba/issues/6846"
         )
     nogil = engine_kwargs.get("nogil", False)
     parallel = engine_kwargs.get("parallel", False)
@@ -97,3 +103,47 @@ def jit_user_function(func: Callable) -> Callable:
         numba_func = numba.extending.register_jitable(func)

     return numba_func
+
+
+_sentinel = object()
+
+
+def prepare_function_arguments(
+    func: Callable, args: tuple, kwargs: dict
+) -> tuple[tuple, dict]:
+    """
+    Prepare arguments for jitted function.
As numba functions do not support kwargs, + we try to move kwargs into args if possible. + + Parameters + ---------- + func : function + user defined function + args : tuple + user input positional arguments + kwargs : dict + user input keyword arguments + + Returns + ------- + tuple[tuple, dict] + args, kwargs + + """ + if not kwargs: + return args, kwargs + + # the udf should have this pattern: def udf(value, *args, **kwargs):... + signature = inspect.signature(func) + arguments = signature.bind(_sentinel, *args, **kwargs) + arguments.apply_defaults() + # Ref: https://peps.python.org/pep-0362/ + # Arguments which could be passed as part of either *args or **kwargs + # will be included only in the BoundArguments.args attribute. + args = arguments.args + kwargs = arguments.kwargs + + assert args[0] is _sentinel + args = args[1:] + + return args, kwargs diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index b2855ff1f4048..43a3c03b6cef9 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -134,8 +134,10 @@ class ExponentialMovingWindow(BaseWindow): Provide exponentially weighted (EW) calculations. Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be - provided if ``times`` is not provided. If ``times`` is provided, + provided if ``times`` is not provided. If ``times`` is provided and ``adjust=True``, ``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided. + If ``times`` is provided and ``adjust=False``, ``halflife`` must be the only + provided decay-specification parameter. Parameters ---------- @@ -358,8 +360,6 @@ def __init__( self.ignore_na = ignore_na self.times = times if self.times is not None: - if not self.adjust: - raise NotImplementedError("times is not supported with adjust=False.") times_dtype = getattr(self.times, "dtype", None) if not ( is_datetime64_dtype(times_dtype) @@ -376,6 +376,11 @@ def __init__( # Halflife is no longer applicable when calculating COM # But allow COM to still be calculated if the user passes other decay args if common.count_not_none(self.com, self.span, self.alpha) > 0: + if not self.adjust: + raise NotImplementedError( + "None of com, span, or alpha can be specified if " + "times is provided and adjust=False" + ) self._com = get_center_of_mass(self.com, self.span, None, self.alpha) else: self._com = 1.0 diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index f14954cd9a4b0..d0c8a2e67b6ca 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -32,6 +31,8 @@ ) if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import ( QuantileInterpolation, WindowingRankType, diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index eb06479fc325e..171d3bc1d1c35 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -4,7 +4,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) import numpy as np @@ -14,6 +13,8 @@ from pandas.core.util.numba_ import jit_user_function if TYPE_CHECKING: + from collections.abc import Callable + from pandas._typing import Scalar @@ -148,6 +149,9 @@ def ewm( # note that len(deltas) = len(vals) - 1 and deltas[i] # is to be used in conjunction with vals[i+1] old_wt *= old_wt_factor ** deltas[start + j - 1] + if not adjust and com == 1: + # update in case of irregular-interval time series + new_wt = 1.0 - old_wt else: weighted = old_wt_factor * weighted if 
is_observation: @@ -227,10 +231,10 @@ def roll_table( stop = end[i] window = values[start:stop] count_nan = np.sum(np.isnan(window), axis=0) - sub_result = numba_func(window, *args) nan_mask = len(window) - count_nan >= minimum_periods + if nan_mask.any(): + result[i, :] = numba_func(window, *args) min_periods_mask[i, :] = nan_mask - result[i, :] = sub_result result = np.where(min_periods_mask, result, np.nan) return result @@ -323,6 +327,9 @@ def ewm_table( # note that len(deltas) = len(vals) - 1 and deltas[i] # is to be used in conjunction with vals[i+1] old_wt[j] *= old_wt_factor ** deltas[i - 1] + if not adjust and com == 1: + # update in case of irregular-interval time series + new_wt = 1.0 - old_wt[j] else: weighted[j] = old_wt_factor * weighted[j] if is_observations[j]: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 2243d8dd1a613..16aa6d7e56a1c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -13,7 +13,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, ) @@ -90,6 +89,7 @@ ) if TYPE_CHECKING: + from collections.abc import Callable from collections.abc import ( Hashable, Iterator, diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 3b2ae5daffdba..a72a16269959d 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,11 +1,16 @@ from __future__ import annotations -from typing import Callable +from typing import TYPE_CHECKING + +import numpy as np from pandas.compat._optional import import_optional_dependency import pandas as pd +if TYPE_CHECKING: + from collections.abc import Callable + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -29,6 +34,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow_numpy"), - pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), }.get diff --git a/pandas/io/common.py b/pandas/io/common.py index 4507a7d08c8ba..a76f0cf6dd34d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -55,10 +55,6 @@ BaseBuffer, ReadCsvBuffer, ) -from pandas.compat import ( - get_bz2_file, - get_lzma_file, -) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -784,9 +780,11 @@ def get_handle( # BZ Compression elif compression == "bz2": + import bz2 + # Overload of "BZ2File" to handle pickle protocol 5 # "Union[str, BaseBuffer]", "str", "Dict[str, Any]" - handle = get_bz2_file()( # type: ignore[call-overload] + handle = bz2.BZ2File( # type: ignore[call-overload] handle, mode=ioargs.mode, **compression_args, @@ -849,7 +847,9 @@ def get_handle( # error: Argument 1 to "LZMAFile" has incompatible type "Union[str, # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], # PathLike[bytes]], IO[bytes]], None]" - handle = get_lzma_file()( + import lzma + + handle = lzma.LZMAFile( handle, # type: ignore[arg-type] ioargs.mode, **compression_args, diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1eb22d4ee9de7..f83f9cb1c8d74 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -14,7 +15,6 @@ IO, TYPE_CHECKING, Any, - Callable, Generic, 
Literal, TypeVar, @@ -957,7 +957,7 @@ class ExcelWriter(Generic[_WorkbookT]): * `xlsxwriter `__ for xlsx files if xlsxwriter is installed otherwise `openpyxl `__ - * `odswriter `__ for ods files + * `odf `__ for ods files See :meth:`DataFrame.to_excel` for typical usage. @@ -1004,7 +1004,7 @@ class ExcelWriter(Generic[_WorkbookT]): * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)`` * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)`` * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)`` - * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` + * odf: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)`` .. versionadded:: 1.3.0 diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index f879f16aa5dc8..e7c5d518abaee 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, MutableMapping, @@ -9,7 +10,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, TypeVar, overload, diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index b42dbaa579ee7..3df3e77a851a3 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -6,8 +6,9 @@ TYPE_CHECKING, Any, ) +import warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -107,6 +108,14 @@ def read_feather( type of object stored in file DataFrame object stored in the file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_sas : Read SAS file into a pandas DataFrame. 
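Given the cross-references above, a quick feather round trip may help orient readers. A minimal sketch, assuming pyarrow is installed; the file name is illustrative:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, None]})
df.to_feather("tmp.feather")

# dtype_backend selects the nullable implementation used on read; with
# "numpy_nullable" the float column comes back as Float64.
back = pd.read_feather("tmp.feather", dtype_backend="numpy_nullable")
print(back.dtypes)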
+ Examples -------- >>> df = pd.read_feather("path/to/file.feather") # doctest: +SKIP @@ -122,10 +131,17 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if dtype_backend is lib.no_default and not using_pyarrow_string_dtype(): - return feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + if dtype_backend is lib.no_default and not using_string_dtype(): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + + return feather.read_feather( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) @@ -139,7 +155,7 @@ def read_feather( elif dtype_backend == "pyarrow": return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) else: raise NotImplementedError diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index d3d0da6f562a7..0af04526ea96d 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -5,10 +5,7 @@ from __future__ import annotations import re -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import warnings from pandas.errors import CSSWarning @@ -16,6 +13,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterable, Iterator, diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b6c6112b05ab3..52b5755558900 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -16,7 +17,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -52,6 +52,7 @@ if TYPE_CHECKING: from pandas._typing import ( + ExcelWriterMergeCells, FilePath, IndexLabel, StorageOptions, @@ -523,8 +524,11 @@ class ExcelFormatter: Column label for index column(s) if desired. If None is given, and `header` and `index` are True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. - merge_cells : bool, default False - Format MultiIndex and Hierarchical Rows as merged cells. + merge_cells : bool or 'columns', default False + Format MultiIndex column headers and Hierarchical Rows as merged cells + if True. Merge MultiIndex column headers only if 'columns'. + .. versionchanged:: 3.0.0 + Added the 'columns' option. inf_rep : str, default `'inf'` representation for np.inf values (which aren't representable in Excel) A `'-'` sign will be added in front of -inf. 
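The new ``merge_cells`` option documented above is easiest to exercise through ``to_excel``, which forwards it to ``ExcelFormatter``. A minimal sketch, assuming a pandas build that includes this change and an Excel engine such as openpyxl; the file name is illustrative:

import pandas as pd

mi = pd.MultiIndex.from_product([["A"], ["x", "y"]])
df = pd.DataFrame([[1, 2]], columns=mi)

# True (default) merges MultiIndex column headers and hierarchical rows,
# "columns" merges only the column headers, and False writes every label
# into its own cell.
df.to_excel("out.xlsx", merge_cells="columns")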
@@ -547,7 +551,7 @@ def __init__( header: Sequence[Hashable] | bool = True, index: bool = True, index_label: IndexLabel | None = None, - merge_cells: bool = False, + merge_cells: ExcelWriterMergeCells = False, inf_rep: str = "inf", style_converter: Callable | None = None, ) -> None: @@ -580,6 +584,9 @@ def __init__( self.index = index self.index_label = index_label self.header = header + + if not isinstance(merge_cells, bool) and merge_cells != "columns": + raise ValueError(f"Unexpected value for {merge_cells=}.") self.merge_cells = merge_cells self.inf_rep = inf_rep @@ -614,7 +621,7 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: columns = self.columns level_strs = columns._format_multi( - sparsify=self.merge_cells, include_names=False + sparsify=self.merge_cells in {True, "columns"}, include_names=False ) level_lengths = get_level_lengths(level_strs) coloffset = 0 @@ -623,7 +630,7 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: if self.index and isinstance(self.df.index, MultiIndex): coloffset = self.df.index.nlevels - 1 - if self.merge_cells: + if self.merge_cells in {True, "columns"}: # Format multi-index as a merged cells. for lnum, name in enumerate(columns.names): yield ExcelCell( @@ -793,7 +800,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # with index names (blank if None) for # unambiguous round-trip, unless not merging, # in which case the names all go on one row Issue #11328 - if isinstance(self.columns, MultiIndex) and self.merge_cells: + if isinstance(self.columns, MultiIndex) and ( + self.merge_cells in {True, "columns"} + ): self.rowcounter += 1 # if index labels are not empty go ahead and dump @@ -801,7 +810,7 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: for cidx, name in enumerate(index_labels): yield ExcelCell(self.rowcounter - 1, cidx, name, None) - if self.merge_cells: + if self.merge_cells and self.merge_cells != "columns": # Format hierarchical rows as merged cells. 
level_strs = self.df.index._format_multi( sparsify=True, include_names=False diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c503121328f53..9ad5ac83e9eae 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -6,6 +6,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Generator, Hashable, Mapping, @@ -22,7 +23,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Final, cast, ) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 0bd4f2935f4d0..67b5eb6f5ee5b 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -5,6 +5,7 @@ from __future__ import annotations from collections.abc import ( + Callable, Iterable, Mapping, Sequence, @@ -13,7 +14,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, TypeVar, Union, ) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 69021eb2656f6..6f4c2fa6c6eae 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -4,13 +4,11 @@ from __future__ import annotations -from contextlib import contextmanager import copy from functools import partial import operator from typing import ( TYPE_CHECKING, - Callable, overload, ) @@ -56,7 +54,7 @@ if TYPE_CHECKING: from collections.abc import ( - Generator, + Callable, Hashable, Sequence, ) @@ -68,6 +66,7 @@ Axis, AxisInt, Concatenate, + ExcelWriterMergeCells, FilePath, IndexLabel, IntervalClosedType, @@ -84,22 +83,6 @@ from pandas import ExcelWriter -try: - import matplotlib as mpl - import matplotlib.pyplot as plt - - has_mpl = True -except ImportError: - has_mpl = False - - -@contextmanager -def _mpl(func: Callable) -> Generator[tuple[Any, Any], None, None]: - if has_mpl: - yield plt, mpl - else: - raise ImportError(f"{func.__name__} requires matplotlib.") - #### # Shared Doc Strings @@ -569,7 +552,7 @@ def to_excel( startrow: int = 0, startcol: int = 0, engine: str | None = None, - merge_cells: bool = True, + merge_cells: ExcelWriterMergeCells = True, encoding: str | None = None, inf_rep: str = "inf", verbose: bool = True, @@ -3832,61 +3815,61 @@ def _background_gradient( else: # else validate gmap against the underlying data gmap = _validate_apply_axis_arg(gmap, "gmap", float, data) - with _mpl(Styler.background_gradient) as (_, _matplotlib): - smin = np.nanmin(gmap) if vmin is None else vmin - smax = np.nanmax(gmap) if vmax is None else vmax - rng = smax - smin - # extend lower / upper bounds, compresses color range - norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + smin = np.nanmin(gmap) if vmin is None else vmin + smax = np.nanmax(gmap) if vmax is None else vmax + rng = smax - smin + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.background_gradient requires matplotlib." + ) + # extend lower / upper bounds, compresses color range + norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high)) + + if cmap is None: + rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]](norm(gmap)) + else: + rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) + + def relative_luminance(rgba) -> float: + """ + Calculate relative luminance of a color. 
+ + The calculation adheres to the W3C standards + (https://www.w3.org/WAI/GL/wiki/Relative_luminance) + + Parameters + ---------- + color : rgb or rgba tuple + + Returns + ------- + float + The relative luminance as a value from 0 to 1 + """ + r, g, b = ( + x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 + for x in rgba[:3] + ) + return 0.2126 * r + 0.7152 * g + 0.0722 * b - if cmap is None: - rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]]( - norm(gmap) + def css(rgba, text_only) -> str: + if not text_only: + dark = relative_luminance(rgba) < text_color_threshold + text_color = "#f1f1f1" if dark else "#000000" + return ( + f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" + f"color: {text_color};" ) else: - rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap)) - - def relative_luminance(rgba) -> float: - """ - Calculate relative luminance of a color. - - The calculation adheres to the W3C standards - (https://www.w3.org/WAI/GL/wiki/Relative_luminance) - - Parameters - ---------- - color : rgb or rgba tuple - - Returns - ------- - float - The relative luminance as a value from 0 to 1 - """ - r, g, b = ( - x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4 - for x in rgba[:3] - ) - return 0.2126 * r + 0.7152 * g + 0.0722 * b - - def css(rgba, text_only) -> str: - if not text_only: - dark = relative_luminance(rgba) < text_color_threshold - text_color = "#f1f1f1" if dark else "#000000" - return ( - f"background-color: {_matplotlib.colors.rgb2hex(rgba)};" - f"color: {text_color};" - ) - else: - return f"color: {_matplotlib.colors.rgb2hex(rgba)};" + return f"color: {_matplotlib.colors.rgb2hex(rgba)};" - if data.ndim == 1: - return [css(rgba, text_only) for rgba in rgbas] - else: - return DataFrame( - [[css(rgba, text_only) for rgba in row] for row in rgbas], - index=data.index, - columns=data.columns, - ) + if data.ndim == 1: + return [css(rgba, text_only) for rgba in rgbas] + else: + return DataFrame( + [[css(rgba, text_only) for rgba in row] for row in rgbas], + index=data.index, + columns=data.columns, + ) def _highlight_between( @@ -4124,20 +4107,22 @@ def css_calc(x, left: float, right: float, align: str, color: str | list | tuple rgbas = None if cmap is not None: # use the matplotlib colormap input - with _mpl(Styler.bar) as (_, _matplotlib): - cmap = ( - _matplotlib.colormaps[cmap] - if isinstance(cmap, str) - else cmap # assumed to be a Colormap instance as documented - ) - norm = _matplotlib.colors.Normalize(left, right) - rgbas = cmap(norm(values)) - if data.ndim == 1: - rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas] - else: - rgbas = [ - [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas - ] + _matplotlib = import_optional_dependency( + "matplotlib", extra="Styler.bar requires matplotlib." 
+        )
+        cmap = (
+            _matplotlib.colormaps[cmap]
+            if isinstance(cmap, str)
+            else cmap  # assumed to be a Colormap instance as documented
+        )
+        norm = _matplotlib.colors.Normalize(left, right)
+        rgbas = cmap(norm(values))
+        if data.ndim == 1:
+            rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas]
+        else:
+            rgbas = [
+                [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas
+            ]

     assert isinstance(align, str)  # mypy: should now be in [left, right, mid, zero]

     if data.ndim == 1:
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
index 92afbc0e150ef..ec718f2a1276f 100644
--- a/pandas/io/formats/style_render.py
+++ b/pandas/io/formats/style_render.py
@@ -1,13 +1,15 @@
 from __future__ import annotations

 from collections import defaultdict
-from collections.abc import Sequence
+from collections.abc import (
+    Callable,
+    Sequence,
+)
 from functools import partial
 import re
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     DefaultDict,
     Optional,
     TypedDict,
@@ -1050,7 +1052,7 @@ def format(
     When using a ``formatter`` string the dtypes must be compatible,
     otherwise a `ValueError` will be raised.

-    When instantiating a Styler, default formatting can be applied be setting the
+    When instantiating a Styler, default formatting can be applied by setting the
     ``pandas.options``:

     - ``styler.format.formatter``: default None.
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 42f5266e7649b..4b8bc48130fab 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -1178,7 +1178,10 @@ def read_html(
     **after** `skiprows` is applied.

     This function will *always* return a list of :class:`DataFrame` *or*
-    it will fail, e.g., it will *not* return an empty list.
+    it will fail, i.e., it will *not* return an empty list, save for some
+    rare cases.
+    It might return an empty list in case of inputs with single row and
+    ``<td>`` containing only whitespaces.
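A sketch of the rare empty-list case this note documents, assuming an HTML parser such as lxml is installed; exact behavior may vary by version:

from io import StringIO

import pandas as pd

# A single row whose <td> holds only whitespace can parse to no tables,
# so an empty list comes back instead of an error being raised.
html = "<table><tr><td>   </td></tr></table>"
print(pd.read_html(StringIO(html)))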
Examples -------- diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 13d74e935f786..b29ead1d14b1d 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Generic, Literal, TypeVar, @@ -60,11 +59,13 @@ from pandas.io.json._table_schema import ( build_table_schema, parse_table_schema, + set_default_names, ) from pandas.io.parsers.readers import validate_integer if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Mapping, ) @@ -353,6 +354,8 @@ def __init__( raise ValueError(msg) self.schema = build_table_schema(obj, index=self.index) + if self.index: + obj = set_default_names(obj) # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): @@ -369,18 +372,22 @@ def __init__( msg = "Overlapping names between the index and columns" raise ValueError(msg) - obj = obj.copy() timedeltas = obj.select_dtypes(include=["timedelta"]).columns + copied = False if len(timedeltas): + obj = obj.copy() + copied = True obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat()) - # Convert PeriodIndex to datetimes before serializing - if isinstance(obj.index.dtype, PeriodDtype): - obj.index = obj.index.to_timestamp() # exclude index from obj if index=False if not self.index: self.obj = obj.reset_index(drop=True) else: + # Convert PeriodIndex to datetimes before serializing + if isinstance(obj.index.dtype, PeriodDtype): + if not copied: + obj = obj.copy(deep=False) + obj.index = obj.index.to_timestamp() self.obj = obj.reset_index(drop=False) self.date_format = "iso" self.orient = "records" @@ -965,7 +972,7 @@ def read(self) -> DataFrame | Series: else: return obj - def _get_object_parser(self, json) -> DataFrame | Series: + def _get_object_parser(self, json: str) -> DataFrame | Series: """ Parses a json document into a pandas object. """ @@ -981,16 +988,14 @@ def _get_object_parser(self, json) -> DataFrame | Series: "date_unit": self.date_unit, "dtype_backend": self.dtype_backend, } - obj = None if typ == "frame": - obj = FrameParser(json, **kwargs).parse() - - if typ == "series" or obj is None: + return FrameParser(json, **kwargs).parse() + elif typ == "series": if not isinstance(dtype, bool): kwargs["dtype"] = dtype - obj = SeriesParser(json, **kwargs).parse() - - return obj + return SeriesParser(json, **kwargs).parse() + else: + raise ValueError(f"{typ=} must be 'frame' or 'series'.") def close(self) -> None: """ @@ -1103,7 +1108,6 @@ def __init__( self.convert_dates = convert_dates self.date_unit = date_unit self.keep_default_dates = keep_default_dates - self.obj: DataFrame | Series | None = None self.dtype_backend = dtype_backend @final @@ -1117,26 +1121,22 @@ def check_keys_split(self, decoded: dict) -> None: raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}") @final - def parse(self): - self._parse() + def parse(self) -> DataFrame | Series: + obj = self._parse() - if self.obj is None: - return None if self.convert_axes: - self._convert_axes() - self._try_convert_types() - return self.obj + obj = self._convert_axes(obj) + obj = self._try_convert_types(obj) + return obj - def _parse(self) -> None: + def _parse(self) -> DataFrame | Series: raise AbstractMethodError(self) @final - def _convert_axes(self) -> None: + def _convert_axes(self, obj: DataFrame | Series) -> DataFrame | Series: """ Try to convert axes. 
""" - obj = self.obj - assert obj is not None # for mypy for axis_name in obj._AXIS_ORDERS: ax = obj._get_axis(axis_name) ser = Series(ax, dtype=ax.dtype, copy=False) @@ -1149,9 +1149,10 @@ def _convert_axes(self) -> None: ) if result: new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) - setattr(self.obj, axis_name, new_axis) + setattr(obj, axis_name, new_axis) + return obj - def _try_convert_types(self) -> None: + def _try_convert_types(self, obj): raise AbstractMethodError(self) @final @@ -1178,8 +1179,10 @@ def _try_convert_data( elif self.dtype is True: pass - else: - # dtype to force + elif not _should_convert_dates( + convert_dates, self.keep_default_dates, name + ): + # convert_dates takes precedence over columns listed in dtypes dtype = ( self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype ) @@ -1190,8 +1193,8 @@ def _try_convert_data( return data, False if convert_dates: - new_data, result = self._try_convert_to_date(data) - if result: + new_data = self._try_convert_to_date(data) + if new_data is not data: return new_data, True converted = False @@ -1241,16 +1244,16 @@ def _try_convert_data( return data, converted @final - def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: + def _try_convert_to_date(self, data: Series) -> Series: """ Try to parse a ndarray like into a date column. Try to coerce object in epoch/iso formats and integer/float in epoch - formats. Return a boolean if parsing was successful. + formats. """ # no conversion on empty if not len(data): - return data, False + return data new_data = data @@ -1261,7 +1264,7 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: try: new_data = data.astype("int64") except OverflowError: - return data, False + return data except (TypeError, ValueError): pass @@ -1273,57 +1276,45 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: | (new_data._values == iNaT) ) if not in_range.all(): - return data, False + return data date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS for date_unit in date_units: try: - new_data = to_datetime(new_data, errors="raise", unit=date_unit) + return to_datetime(new_data, errors="raise", unit=date_unit) except (ValueError, OverflowError, TypeError): continue - return new_data, True - return data, False + return data class SeriesParser(Parser): _default_orient = "index" _split_keys = ("name", "index", "data") - obj: Series | None - def _parse(self) -> None: + def _parse(self) -> Series: data = ujson_loads(self.json, precise_float=self.precise_float) if self.orient == "split": decoded = {str(k): v for k, v in data.items()} self.check_keys_split(decoded) - self.obj = Series(**decoded) + return Series(**decoded) else: - self.obj = Series(data) + return Series(data) - def _try_convert_types(self) -> None: - if self.obj is None: - return - obj, result = self._try_convert_data( - "data", self.obj, convert_dates=self.convert_dates - ) - if result: - self.obj = obj + def _try_convert_types(self, obj: Series) -> Series: + obj, _ = self._try_convert_data("data", obj, convert_dates=self.convert_dates) + return obj class FrameParser(Parser): _default_orient = "columns" _split_keys = ("columns", "index", "data") - obj: DataFrame | None - def _parse(self) -> None: + def _parse(self) -> DataFrame: json = self.json orient = self.orient - if orient == "columns": - self.obj = DataFrame( - ujson_loads(json, precise_float=self.precise_float), dtype=None - ) - elif orient == "split": + if orient == "split": decoded = { str(k): 
v for k, v in ujson_loads(json, precise_float=self.precise_float).items() @@ -1337,90 +1328,61 @@ def _parse(self) -> None: orig_names, is_potential_multi_index(orig_names, None), ) - self.obj = DataFrame(dtype=None, **decoded) + return DataFrame(dtype=None, **decoded) elif orient == "index": - self.obj = DataFrame.from_dict( + return DataFrame.from_dict( ujson_loads(json, precise_float=self.precise_float), dtype=None, orient="index", ) elif orient == "table": - self.obj = parse_table_schema(json, precise_float=self.precise_float) + return parse_table_schema(json, precise_float=self.precise_float) else: - self.obj = DataFrame( + # includes orient == "columns" + return DataFrame( ujson_loads(json, precise_float=self.precise_float), dtype=None ) - def _process_converter( - self, - f: Callable[[Hashable, Series], tuple[Series, bool]], - filt: Callable[[Hashable], bool] | None = None, - ) -> None: - """ - Take a conversion function and possibly recreate the frame. - """ - if filt is None: - filt = lambda col: True - - obj = self.obj - assert obj is not None # for mypy - - needs_new_obj = False - new_obj = {} - for i, (col, c) in enumerate(obj.items()): - if filt(col): - new_data, result = f(col, c) - if result: - c = new_data - needs_new_obj = True - new_obj[i] = c - - if needs_new_obj: - # possibly handle dup columns - new_frame = DataFrame(new_obj, index=obj.index) - new_frame.columns = obj.columns - self.obj = new_frame - - def _try_convert_types(self) -> None: - if self.obj is None: - return - if self.convert_dates: - self._try_convert_dates() - - self._process_converter( - lambda col, c: self._try_convert_data(col, c, convert_dates=False) + def _try_convert_types(self, obj: DataFrame) -> DataFrame: + arrays = [] + for col_label, series in obj.items(): + result, _ = self._try_convert_data( + col_label, + series, + convert_dates=_should_convert_dates( + self.convert_dates, + keep_default_dates=self.keep_default_dates, + col=col_label, + ), + ) + arrays.append(result.array) + return DataFrame._from_arrays( + arrays, obj.columns, obj.index, verify_integrity=False ) - def _try_convert_dates(self) -> None: - if self.obj is None: - return - - # our columns to parse - convert_dates_list_bool = self.convert_dates - if isinstance(convert_dates_list_bool, bool): - convert_dates_list_bool = [] - convert_dates = set(convert_dates_list_bool) - - def is_ok(col) -> bool: - """ - Return if this col is ok to try for a date parse. - """ - if col in convert_dates: - return True - if not self.keep_default_dates: - return False - if not isinstance(col, str): - return False - - col_lower = col.lower() - if ( - col_lower.endswith(("_at", "_time")) - or col_lower == "modified" - or col_lower == "date" - or col_lower == "datetime" - or col_lower.startswith("timestamp") - ): - return True - return False - self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok) +def _should_convert_dates( + convert_dates: bool | list[str], + keep_default_dates: bool, + col: Hashable, +) -> bool: + """ + Return bool whether a DataFrame column should be cast to datetime. 
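For example (an editor's sketch of the precedence; this is a private helper, shown only for illustration):

>>> _should_convert_dates(True, keep_default_dates=True, col="created_at")
True
>>> _should_convert_dates(["when"], keep_default_dates=False, col="when")
True
>>> _should_convert_dates(True, keep_default_dates=True, col="price")
False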
+ """ + if convert_dates is False: + # convert_dates=True means follow keep_default_dates + return False + elif not isinstance(convert_dates, bool) and col in set(convert_dates): + return True + elif not keep_default_dates: + return False + elif not isinstance(col, str): + return False + col_lower = col.lower() + if ( + col_lower.endswith(("_at", "_time")) + or col_lower in {"modified", "date", "datetime"} + or col_lower.startswith("timestamp") + ): + return True + return False diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index d4b412404c308..d966e38fa11a5 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -114,7 +114,7 @@ def set_default_names(data): ) return data - data = data.copy() + data = data.copy(deep=False) if data.index.nlevels > 1: data.index.names = com.fill_missing_names(data.index.names) else: @@ -275,7 +275,7 @@ def build_table_schema( >>> df = pd.DataFrame( ... {'A': [1, 2, 3], ... 'B': ['a', 'b', 'c'], - ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), + ... 'C': pd.date_range('2016-01-01', freq='D', periods=3), ... }, index=pd.Index(range(3), name='idx')) >>> build_table_schema(df) {'fields': \ diff --git a/pandas/io/orc.py b/pandas/io/orc.py index d4b4fd90658ad..b297164d5d108 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,7 +9,7 @@ Literal, ) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -85,6 +85,14 @@ def read_orc( DataFrame DataFrame based on the ORC file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_sas : Load a SAS file into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
+ Notes ----- Before using this function you should read the :ref:`user guide about ORC ` @@ -128,7 +136,7 @@ def read_orc( df = pa_table.to_pandas(types_mapper=mapping.get) return df else: - if using_pyarrow_string_dtype(): + if using_string_dtype(): types_mapper = arrow_string_types_mapper() else: types_mapper = None diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 08983ceed44e5..77a9cc3fca644 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -10,9 +10,12 @@ Any, Literal, ) -from warnings import catch_warnings +from warnings import ( + catch_warnings, + filterwarnings, +) -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -254,7 +257,7 @@ def read( to_pandas_kwargs["types_mapper"] = mapping.get elif dtype_backend == "pyarrow": to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() path_or_handle, handles, filesystem = _get_path_or_handle( @@ -271,7 +274,13 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = pa_table.to_pandas(**to_pandas_kwargs) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: @@ -384,7 +393,15 @@ def read( try: parquet_file = self.api.ParquetFile(path, **parquet_kwargs) - return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs) + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + return parquet_file.to_pandas( + columns=columns, filters=filters, **kwargs + ) finally: if handles is not None: handles.close() diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 8b6f7d5750ffe..86bb5f190e403 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -287,17 +287,23 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_pyarrow_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + if dtype_backend == "pyarrow": + frame = table.to_pandas(types_mapper=pd.ArrowDtype) + elif dtype_backend == "numpy_nullable": + # Modify the default mapping to also + # map null to Int64 (to match other engines) + dtype_mapping = _arrow_dtype_mapping() + dtype_mapping[pa.null()] = pd.Int64Dtype() + frame = table.to_pandas(types_mapper=dtype_mapping.get) + elif using_string_dtype(): + frame = 
table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - frame = table.to_pandas() + else: + frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c6cc85b9f722b..7294efe843cce 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -4,10 +4,10 @@ from copy import copy import csv from enum import Enum +import itertools from typing import ( TYPE_CHECKING, Any, - Callable, cast, final, overload, @@ -29,27 +29,19 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, is_dict_like, - is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, - pandas_dtype, -) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, ) from pandas.core.dtypes.missing import isna from pandas import ( - ArrowDtype, DataFrame, DatetimeIndex, StringDtype, @@ -59,12 +51,9 @@ ArrowExtensionArray, BaseMaskedArray, BooleanArray, - Categorical, - ExtensionArray, FloatingArray, IntegerArray, ) -from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -78,6 +67,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Iterable, Mapping, Sequence, @@ -86,7 +76,6 @@ from pandas._typing import ( ArrayLike, DtypeArg, - DtypeObj, Hashable, HashableT, Scalar, @@ -127,7 +116,6 @@ def __init__(self, kwds) -> None: "for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates - self._parse_date_cols: set = set() self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) @@ -145,12 +133,6 @@ def __init__(self, kwds) -> None: self.false_values = kwds.get("false_values") self.cache_dates = kwds.pop("cache_dates", True) - self._date_conv = _make_date_converter( - date_format=self.date_format, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - ) - # validate header options for mi self.header = kwds.get("header") if is_list_like(self.header, allow_sets=False): @@ -181,58 +163,12 @@ def __init__(self, kwds) -> None: self._first_chunk = True - self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: - """ - Check if parse_dates are in columns. - - If user has provided names for parse_dates, check if those columns - are available. - - Parameters - ---------- - columns : list - List of names of the dataframe. - - Returns - ------- - The names of the columns which will get parsed later if a list - is given as specification. - - Raises - ------ - ValueError - If column to parse_date is not in dataframe. 
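The user-facing contract of the parse-dates validation removed below is unchanged by the refactor (the check moves to a module-level ``validate_parse_dates_presence``, called later in this patch). A sketch of the error it documents:

from io import StringIO

import pandas as pd

# Naming a parse_dates column that is absent from the data still raises,
# with the message built by the validator.
try:
    pd.read_csv(StringIO("a,b\n2021-01-01,1\n"), parse_dates=["when"])
except ValueError as exc:
    print(exc)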
- - """ - if not isinstance(self.parse_dates, list): - return set() - - # get only columns that are references using names (str), not by index - missing_cols = ", ".join( - sorted( - { - col - for col in self.parse_dates - if isinstance(col, str) and col not in columns - } - ) - ) - if missing_cols: - raise ValueError( - f"Missing column provided to 'parse_dates': '{missing_cols}'" - ) - # Convert positions to actual column names - return { - col if (isinstance(col, str) or col in columns) else columns[col] - for col in self.parse_dates - } - def close(self) -> None: pass @@ -336,49 +272,37 @@ def _maybe_make_multi_index_columns( @final def _make_index( - self, data, alldata, columns, indexnamerow: list[Scalar] | None = None + self, alldata, columns, indexnamerow: list[Scalar] | None = None ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: index: Index | None - if not is_index_col(self.index_col) or not self.index_col: - index = None + if isinstance(self.index_col, list) and len(self.index_col): + to_remove = [] + indexes = [] + for idx in self.index_col: + if isinstance(idx, str): + raise ValueError(f"Index {idx} invalid") + to_remove.append(idx) + indexes.append(alldata[idx]) + # remove index items from content and columns, don't pop in + # loop + for i in sorted(to_remove, reverse=True): + alldata.pop(i) + if not self._implicit_index: + columns.pop(i) + index = self._agg_index(indexes) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index = index.set_names(indexnamerow[:coffset]) else: - simple_index = self._get_simple_index(alldata, columns) - index = self._agg_index(simple_index) - - # add names for the index - if indexnamerow: - coffset = len(indexnamerow) - len(columns) - assert index is not None - index = index.set_names(indexnamerow[:coffset]) + index = None # maybe create a mi on the columns columns = self._maybe_make_multi_index_columns(columns, self.col_names) return index, columns - @final - def _get_simple_index(self, data, columns): - def ix(col): - if not isinstance(col, str): - return col - raise ValueError(f"Index {col} invalid") - - to_remove = [] - index = [] - for idx in self.index_col: - i = ix(idx) - to_remove.append(i) - index.append(data[i]) - - # remove index items from content and columns, don't pop in - # loop - for i in sorted(to_remove, reverse=True): - data.pop(i) - if not self._implicit_index: - columns.pop(i) - - return index - @final def _clean_mapping(self, mapping): """converts col numbers to names""" @@ -398,15 +322,23 @@ def _clean_mapping(self, mapping): return clean @final - def _agg_index(self, index, try_parse_dates: bool = True) -> Index: + def _agg_index(self, index) -> Index: arrays = [] converters = self._clean_mapping(self.converters) + clean_dtypes = self._clean_mapping(self.dtype) - for i, arr in enumerate(index): - if try_parse_dates and self._should_parse_dates(i): - arr = self._date_conv( + if self.index_names is not None: + names: Iterable = self.index_names + else: + names = itertools.cycle([None]) + for i, (arr, name) in enumerate(zip(index, names)): + if self._should_parse_dates(i): + arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) if self.na_filter: @@ -420,14 +352,12 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: assert self.index_names is not None col_name = self.index_names[i] if col_name is not None: - 
col_na_values, col_na_fvalues = _get_na_values( + col_na_values, col_na_fvalues = get_na_values( col_name, self.na_values, self.na_fvalues, self.keep_default_na ) else: col_na_values, col_na_fvalues = set(), set() - clean_dtypes = self._clean_mapping(self.dtype) - cast_type = None index_converter = False if self.index_names is not None: @@ -444,96 +374,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) - - return index - - @final - def _convert_to_ndarrays( - self, - dct: Mapping, - na_values, - na_fvalues, - converters=None, - dtypes=None, - ) -> dict[Any, np.ndarray]: - result = {} - for c, values in dct.items(): - conv_f = None if converters is None else converters.get(c, None) - if isinstance(dtypes, dict): - cast_type = dtypes.get(c, None) - else: - # single dtype or None - cast_type = dtypes - - if self.na_filter: - col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na - ) + if cast_type is not None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) else: - col_na_values, col_na_fvalues = set(), set() - - if c in self._parse_date_cols: - # GH#26203 Do not convert columns which get converted to dates - # but replace nans to ensure to_datetime works - mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) - np.putmask(values, mask, np.nan) - result[c] = values - continue + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - if conv_f is not None: - # conv_f applied to data before inference - if cast_type is not None: - warnings.warn( - ( - "Both a converter and dtype were specified " - f"for column {c} - only the converter will be used." 
- ), - ParserWarning, - stacklevel=find_stack_level(), - ) - - try: - values = lib.map_infer(values, conv_f) - except ValueError: - mask = algorithms.isin(values, list(na_values)).view(np.uint8) - values = lib.map_infer_mask(values, conv_f, mask) - - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool=False, - ) - else: - is_ea = is_extension_array_dtype(cast_type) - is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) - - # general type inference and conversion - cvals, na_count = self._infer_types( - values, - set(col_na_values) | col_na_fvalues, - cast_type is None, - try_num_bool, - ) - - # type specified in dtype param or cast_type is an EA - if cast_type is not None: - cast_type = pandas_dtype(cast_type) - if cast_type and (cvals.dtype != cast_type or is_ea): - if not is_ea and na_count > 0: - if is_bool_dtype(cast_type): - raise ValueError(f"Bool column has NA values in column {c}") - cvals = self._cast_types(cvals, cast_type, c) - - result[c] = cvals - return result + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _set_noconvert_dtype_columns( @@ -580,6 +431,7 @@ def _set(x) -> int: return x if isinstance(self.parse_dates, list): + validate_parse_dates_presence(self.parse_dates, names) for val in self.parse_dates: noconvert_columns.add(_set(val)) @@ -705,78 +557,6 @@ def _infer_types( return result, na_count - @final - def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: - """ - Cast values to specified type - - Parameters - ---------- - values : ndarray or ExtensionArray - cast_type : np.dtype or ExtensionDtype - dtype to cast values to - column : string - column name - used only for error reporting - - Returns - ------- - converted : ndarray or ExtensionArray - """ - if isinstance(cast_type, CategoricalDtype): - known_cats = cast_type.categories is not None - - if not is_object_dtype(values.dtype) and not known_cats: - # TODO: this is for consistency with - # c-parser which parses all categories - # as strings - values = lib.ensure_string_array( - values, skipna=False, convert_na_value=False - ) - - cats = Index(values).unique().dropna() - values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, true_values=self.true_values - ) - - # use the EA's implementation of casting - elif isinstance(cast_type, ExtensionDtype): - array_type = cast_type.construct_array_type() - try: - if isinstance(cast_type, BooleanDtype): - # error: Unexpected keyword argument "true_values" for - # "_from_sequence_of_strings" of "ExtensionArray" - return array_type._from_sequence_of_strings( # type: ignore[call-arg] - values, - dtype=cast_type, - true_values=self.true_values, - false_values=self.false_values, - ) - else: - return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError as err: - raise NotImplementedError( - f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" - ) from err - - elif isinstance(values, ExtensionArray): - values = values.astype(cast_type, copy=False) - elif issubclass(cast_type.type, str): - # TODO: why skipna=True here and False above? 
some tests depend - # on it here, but nothing fails if we change it above - # (as no tests get there as of 2022-12-06) - values = lib.ensure_string_array( - values, skipna=True, convert_na_value=False - ) - else: - try: - values = astype_array(values, cast_type, copy=True) - except ValueError as err: - raise ValueError( - f"Unable to convert column {column} to type {cast_type}" - ) from err - return values - @overload def _do_date_conversions( self, @@ -797,16 +577,25 @@ def _do_date_conversions( names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, ) -> Mapping[Hashable, ArrayLike] | DataFrame: - if isinstance(self.parse_dates, list): - return _process_date_conversion( - data, - self._date_conv, - self.parse_dates, - self.index_col, - self.index_names, - names, - dtype_backend=self.dtype_backend, + if not isinstance(self.parse_dates, list): + return data + for colspec in self.parse_dates: + if isinstance(colspec, int) and colspec not in data: + colspec = names[colspec] + if (isinstance(self.index_col, list) and colspec in self.index_col) or ( + isinstance(self.index_names, list) and colspec in self.index_names + ): + continue + result = date_converter( + data[colspec], + col=colspec, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, ) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data[colspec] = result # type: ignore[index] return data @@ -840,35 +629,6 @@ def _check_data_length( stacklevel=find_stack_level(), ) - @overload - def _evaluate_usecols( - self, - usecols: Callable[[Hashable], object], - names: Iterable[Hashable], - ) -> set[int]: ... - - @overload - def _evaluate_usecols( - self, usecols: SequenceT, names: Iterable[Hashable] - ) -> SequenceT: ... - - @final - def _evaluate_usecols( - self, - usecols: Callable[[Hashable], object] | SequenceT, - names: Iterable[Hashable], - ) -> SequenceT | set[int]: - """ - Check whether or not the 'usecols' parameter - is a callable. If so, enumerates the 'names' - parameter and returns a set of indices for - each entry in 'names' that evaluates to True. - If not a callable, returns 'usecols'. - """ - if callable(usecols): - return {i for i, name in enumerate(names) if usecols(name)} - return usecols - @final def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT: """ @@ -901,56 +661,6 @@ def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> Sequen return usecols - @final - def _validate_usecols_arg(self, usecols): - """ - Validate the 'usecols' parameter. - - Checks whether or not the 'usecols' parameter contains all integers - (column selection by index), strings (column by name) or is a callable. - Raises a ValueError if that is not the case. - - Parameters - ---------- - usecols : list-like, callable, or None - List of columns to use when parsing or a callable that can be used - to filter a list of table columns. - - Returns - ------- - usecols_tuple : tuple - A tuple of (verified_usecols, usecols_dtype). - - 'verified_usecols' is either a set if an array-like is passed in or - 'usecols' if a callable or None is passed in. - - 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like - is passed in or None if a callable or None is passed in. - """ - msg = ( - "'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable." 
- ) - if usecols is not None: - if callable(usecols): - return usecols, None - - if not is_list_like(usecols): - # see gh-20529 - # - # Ensure it is iterable container but not string. - raise ValueError(msg) - - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - - if usecols_dtype not in ("empty", "integer", "string"): - raise ValueError(msg) - - usecols = set(usecols) - - return usecols, usecols_dtype - return usecols, None - @final def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: if not is_index_col(index_col): @@ -1004,12 +714,11 @@ def _get_empty_meta( dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -1026,8 +735,14 @@ def _get_empty_meta( if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): @@ -1040,40 +755,37 @@ def _get_empty_meta( return index, columns, col_dict -def _make_date_converter( +def date_converter( + date_col, + col: Hashable, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - def converter(date_col, col: Hashable): - if date_col.dtype.kind in "Mm": - return date_col - - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format + if date_col.dtype.kind in "Mm": + return date_col + + date_fmt = date_format.get(col) if isinstance(date_format, dict) else date_format + + str_objs = lib.ensure_string_array(np.asarray(date_col)) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs - str_objs = lib.ensure_string_array(date_col) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - # test_multi_index_parse_dates - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - - return converter + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values parser_defaults = { @@ -1116,43 +828,7 @@ def converter(date_col, col: Hashable): } -def _process_date_conversion( - data_dict: Mapping[Hashable, ArrayLike] | DataFrame, - converter: Callable, - parse_spec: list, - index_col, - index_names, - columns: Sequence[Hashable] | Index, - dtype_backend=lib.no_default, -) -> Mapping[Hashable, ArrayLike] | DataFrame: - for colspec in parse_spec: - if isinstance(colspec, int) and colspec not in data_dict: - colspec = columns[colspec] - if (isinstance(index_col, list) 
and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - result = converter(np.asarray(data_dict[colspec]), col=colspec) - # error: Unsupported target for indexed assignment - # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") - data_dict[colspec] = result # type: ignore[index] - - return data_dict - - -def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): +def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. @@ -1189,3 +865,128 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): def is_index_col(col) -> bool: return col is not None and col is not False + + +def validate_parse_dates_presence( + parse_dates: bool | list, columns: Sequence[Hashable] +) -> set: + """ + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Returns + ------- + The names of the columns which will get parsed later if a list + is given as specification. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. + + """ + if not isinstance(parse_dates, list): + return set() + + missing = set() + unique_cols = set() + for col in parse_dates: + if isinstance(col, str): + if col not in columns: + missing.add(col) + else: + unique_cols.add(col) + elif col in columns: + unique_cols.add(col) + else: + unique_cols.add(columns[col]) + if missing: + missing_cols = ", ".join(sorted(missing)) + raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") + return unique_cols + + +def _validate_usecols_arg(usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. 
+ raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None + + +@overload +def evaluate_callable_usecols( + usecols: Callable[[Hashable], object], + names: Iterable[Hashable], +) -> set[int]: ... + + +@overload +def evaluate_callable_usecols( + usecols: SequenceT, names: Iterable[Hashable] +) -> SequenceT: ... + + +def evaluate_callable_usecols( + usecols: Callable[[Hashable], object] | SequenceT, + names: Iterable[Hashable], +) -> SequenceT | set[int]: + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. + """ + if callable(usecols): + return {i for i, name in enumerate(names) if usecols(name)} + return usecols diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4de626288aa41..818c9f5ff6b80 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -30,7 +30,10 @@ from pandas.io.parsers.base_parser import ( ParserBase, ParserError, + date_converter, + evaluate_callable_usecols, is_index_col, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -131,7 +134,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: self.orig_names = self.names[:] # type: ignore[has-type] if self.usecols: - usecols = self._evaluate_usecols(self.usecols, self.orig_names) + usecols = evaluate_callable_usecols(self.usecols, self.orig_names) # GH 14671 # assert for mypy, orig_names is List or None, None would error in issubset @@ -160,7 +163,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: ) # error: Cannot determine type of 'names' - self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + validate_parse_dates_presence(self.parse_dates, self.names) # type: ignore[has-type] self._set_noconvert_columns() # error: Cannot determine type of 'names' @@ -254,8 +257,7 @@ def read( columns, self.col_names ) - if self.usecols is not None: - columns = self._filter_usecols(columns) + columns = _filter_usecols(self.usecols, columns) col_dict = {k: v for k, v in col_dict.items() if k in columns} @@ -288,13 +290,21 @@ def read( else: values = data.pop(self.index_col[i]) - values = self._maybe_parse_dates(values, i, try_parse_dates=True) + if self._should_parse_dates(i): + values = date_converter( + values, + col=self.index_names[i] + if self.index_names is not None + else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, + ) arrays.append(values) index = ensure_index_from_sequences(arrays) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) @@ -318,8 +328,7 @@ def read( names = list(self.orig_names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) # columns as list alldata = [x[1] for x in data_tups] @@ -329,26 +338,17 @@ def read( data = {k: v for k, (i, v) in zip(names, data_tups)} date_data = self._do_date_conversions(names, data) - index, column_names = self._make_index(date_data, alldata, names) + index, 
column_names = self._make_index(alldata, names) return index, column_names, date_data - def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: - # hackish - usecols = self._evaluate_usecols(self.usecols, names) - if usecols is not None and len(names) != len(usecols): - return [ - name for i, name in enumerate(names) if i in usecols or name in usecols - ] - return names - def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): - if try_parse_dates and self._should_parse_dates(index): - values = self._date_conv( - values, - col=self.index_names[index] if self.index_names is not None else None, - ) - return values +def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]: + # hackish + usecols = evaluate_callable_usecols(usecols, names) + if usecols is not None and len(names) != len(usecols): + return [name for i, name in enumerate(names) if i in usecols or name in usecols] + return names def _concatenate_chunks( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f7d2aa2419429..3a2a1c37f1879 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -10,9 +10,11 @@ from typing import ( IO, TYPE_CHECKING, + Any, DefaultDict, Literal, cast, + final, ) import warnings @@ -27,20 +29,40 @@ from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_integer, is_numeric_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.indexes.api import Index + from pandas.io.common import ( dedup_names, is_potential_multi_index, ) from pandas.io.parsers.base_parser import ( ParserBase, + evaluate_callable_usecols, + get_na_values, parser_defaults, + validate_parse_dates_presence, ) if TYPE_CHECKING: @@ -53,13 +75,13 @@ from pandas._typing import ( ArrayLike, + DtypeObj, ReadCsvBuffer, Scalar, T, ) from pandas import ( - Index, MultiIndex, Series, ) @@ -106,9 +128,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.quoting = kwds["quoting"] self.skip_blank_lines = kwds["skip_blank_lines"] - self.has_index_names = False - if "has_index_names" in kwds: - self.has_index_names = kwds["has_index_names"] + # Passed from read_excel + self.has_index_names = kwds.get("has_index_names", False) self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] @@ -157,7 +178,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: if self._col_indices is None: self._col_indices = list(range(len(self.columns))) - self._parse_date_cols = self._validate_parse_dates_presence(self.columns) self._no_thousands_columns = self._set_no_thousand_columns() if len(self.decimal) != 1: @@ -279,9 +299,10 @@ def read( return index, conv_columns, col_dict # handle new style for names in index - count_empty_content_vals = count_empty_vals(content[0]) indexnamerow = None - if self.has_index_names and count_empty_content_vals == len(columns): + if self.has_index_names and sum( + int(v == "" or v is None) for v in content[0] + ) == len(columns): indexnamerow = content[0] content 
= content[1:] @@ -291,9 +312,7 @@ def read( conv_data = self._convert_data(data) conv_data = self._do_date_conversions(columns, conv_data) - index, result_columns = self._make_index( - conv_data, alldata, columns, indexnamerow - ) + index, result_columns = self._make_index(alldata, columns, indexnamerow) return index, result_columns, conv_data @@ -370,6 +389,165 @@ def _convert_data( clean_dtypes, ) + @final + def _convert_to_ndarrays( + self, + dct: Mapping, + na_values, + na_fvalues, + converters=None, + dtypes=None, + ) -> dict[Any, np.ndarray]: + result = {} + parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns) + for c, values in dct.items(): + conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if c in parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) # pyright: ignore[reportArgumentType] + np.putmask(values, mask, np.nan) + result[c] = values + continue + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will be used." + ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool=False, + ) + else: + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, + ) + + # type specified in dtype param or cast_type is an EA + if cast_type is not None: + cast_type = pandas_dtype(cast_type) + if cast_type and (cvals.dtype != cast_type or is_ea): + if not is_ea and na_count > 0: + if is_bool_dtype(cast_type): + raise ValueError(f"Bool column has NA values in column {c}") + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + return result + + @final + def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray or ExtensionArray + cast_type : np.dtype or ExtensionDtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray or ExtensionArray + """ + if isinstance(cast_type, CategoricalDtype): + known_cats = cast_type.categories is not None + + if not is_object_dtype(values.dtype) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = lib.ensure_string_array( + values, skipna=False, convert_na_value=False + ) + + cats = 
Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif isinstance(cast_type, ExtensionDtype): + array_type = cast_type.construct_array_type() + try: + if isinstance(cast_type, BooleanDtype): + # error: Unexpected keyword argument "true_values" for + # "_from_sequence_of_strings" of "ExtensionArray" + values_str = [str(val) for val in values] + return array_type._from_sequence_of_strings( # type: ignore[call-arg] + values_str, + dtype=cast_type, + true_values=self.true_values, # pyright: ignore[reportCallIssue] + false_values=self.false_values, # pyright: ignore[reportCallIssue] + none_values=self.na_values, # pyright: ignore[reportCallIssue] + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + elif isinstance(values, ExtensionArray): + values = values.astype(cast_type, copy=False) + elif issubclass(cast_type.type, str): + # TODO: why skipna=True here and False above? some tests depend + # on it here, but nothing fails if we change it above + # (as no tests get there as of 2022-12-06) + values = lib.ensure_string_array( + values, skipna=True, convert_na_value=False + ) + else: + try: + values = astype_array(values, cast_type, copy=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + @cache_readonly def _have_mi_columns(self) -> bool: if self.header is None: @@ -426,7 +604,7 @@ def _infer_columns( # serve as the 'line' for parsing if have_mi_columns and hr > 0: if clear_buffer: - self._clear_buffer() + self.buf.clear() columns.append([None] * len(columns[-1])) return columns, num_original_columns, unnamed_cols @@ -508,7 +686,7 @@ def _infer_columns( num_original_columns = len(this_columns) if clear_buffer: - self._clear_buffer() + self.buf.clear() first_line: list[Scalar] | None if names is not None: @@ -595,7 +773,7 @@ def _handle_usecols( col_indices: set[int] | list[int] if self.usecols is not None: if callable(self.usecols): - col_indices = self._evaluate_usecols(self.usecols, usecols_key) + col_indices = evaluate_callable_usecols(self.usecols, usecols_key) elif any(isinstance(u, str) for u in self.usecols): if len(columns) > 1: raise ValueError( @@ -915,9 +1093,6 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: lines=lines, search=self.decimal, replace="." ) - def _clear_buffer(self) -> None: - self.buf = [] - def _get_index_name( self, ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]: @@ -1347,10 +1522,6 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]: ] -def count_empty_vals(vals) -> int: - return sum(1 for v in vals if v == "" or v is None) - - def _validate_skipfooter_arg(skipfooter: int) -> int: """ Validate the 'skipfooter' parameter. 
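[Annotation] The `_agg_index` rewrite above builds each index level as an `Index` directly, applying a user-supplied dtype and skipping RangeIndex inference when one is given. A small sketch of the visible effect on toy data (the no-dtype result depends on the inference rules of the installed version, so it is hedged in the comment):

```python
# Index dtype handling after the _agg_index change: an explicit dtype for
# the index column is applied as-is, bypassing RangeIndex inference.
import io

import pandas as pd

csv = "a,b\n0,x\n1,y\n"

typed = pd.read_csv(io.StringIO(csv), index_col="a", dtype={"a": "int64"})
plain = pd.read_csv(io.StringIO(csv), index_col="a")

print(type(typed.index).__name__)  # Index -- dtype given, inference skipped
print(type(plain.index).__name__)  # may be RangeIndex with this change
```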
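[Annotation] `validate_parse_dates_presence` likewise becomes a module-level helper shared by both engines. Its contract is easiest to see from the caller's side; a minimal sketch on toy data:

```python
# Behavior enforced by validate_parse_dates_presence: string entries in
# parse_dates must name existing columns (integers are positional), and a
# missing name raises before any parsing work is done.
import io

import pandas as pd

csv = "a,b\n2024-01-01,1\n2024-01-02,2\n"

df = pd.read_csv(io.StringIO(csv), parse_dates=["a"])
print(df["a"].dtype)  # datetime64[ns]

try:
    pd.read_csv(io.StringIO(csv), parse_dates=["c"])
except ValueError as err:
    print(err)  # Missing column provided to 'parse_dates': 'c'
```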
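[Annotation] The same flattening happens to date conversion: `_make_date_converter`'s closure becomes the plain `date_converter` shown above. Its fallback is easy to miss in the diff: when `to_datetime` rejects the strings, the column is returned as strings rather than raising. A sketch of the two branches:

```python
# date_converter's fallback in isolation: parseable strings become
# datetime64 values; a column to_datetime rejects stays as strings.
import numpy as np

import pandas as pd

good = np.array(["2024-01-01", "2024-01-02"], dtype=object)
mixed = np.array(["2024-01-01", "not a date"], dtype=object)

print(pd.to_datetime(good))  # DatetimeIndex(['2024-01-01', '2024-01-02'], ...)

try:
    pd.to_datetime(mixed)
except (ValueError, TypeError):
    # the branch date_converter takes before returning str_objs unchanged
    print("unparseable -> column stays as strings")
```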
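[Annotation] Finally for these files, `_evaluate_usecols` leaves the class with its body intact as `evaluate_callable_usecols`; only the callable form is transformed. Reproducing the function with toy header names:

```python
# evaluate_callable_usecols, as defined in the diff above: the predicate
# runs once per header name and yields a set of column indices; any
# non-callable usecols is passed through untouched.
def evaluate_callable_usecols(usecols, names):
    if callable(usecols):
        return {i for i, name in enumerate(names) if usecols(name)}
    return usecols

names = ["date", "open", "close", "volume"]
print(evaluate_callable_usecols(lambda c: "o" in c, names))  # {1, 2, 3}
print(evaluate_callable_usecols(["date", "close"], names))   # returned as-is
```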
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 66edbcaa755ed..0cca1ebdb8c8f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -17,7 +17,6 @@ IO, TYPE_CHECKING, Any, - Callable, Generic, Literal, TypedDict, @@ -70,6 +69,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterable, Mapping, @@ -142,8 +142,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): _read_shared = dict -_doc_read_csv_and_table = ( - r""" +_doc_read_csv_and_table = r""" {summary} Also supports optionally iterating or breaking of the file @@ -272,10 +271,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific per-column ``NA`` values. By default the following values are interpreted as - ``NaN``: " """ - + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") - + """ ". - + ``NaN``: "{na_values_str}". keep_default_na : bool, default True Whether or not to include the default ``NaN`` values when parsing the data. Depending on whether ``na_values`` is passed in, the behavior is as follows: @@ -357,8 +353,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): quotechar : str (length 1), optional Character used to denote the start and end of a quoted item. Quoted items can include the ``delimiter`` and it will be ignored. -quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \ -3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL +quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, 3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, @@ -544,8 +539,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): col 2 datetime64[ns] col 3 datetime64[ns] dtype: object -""" -) +""" # noqa: E501 class _C_Parser_Defaults(TypedDict): @@ -674,6 +668,14 @@ def _read( # Extract some of the arguments (pass chunksize on). iterator = kwds.get("iterator", False) chunksize = kwds.get("chunksize", None) + + # Check type of encoding_errors + errors = kwds.get("encoding_errors", "strict") + if not isinstance(errors, str): + raise ValueError( + f"encoding_errors must be a string, got {type(errors).__name__}" + ) + if kwds.get("engine") == "pyarrow": if iterator: raise ValueError( @@ -748,6 +750,9 @@ def read_csv( summary="Read a comma-separated values (csv) file into DataFrame.", see_also_func_name="read_table", see_also_func_summary="Read general delimited file into DataFrame.", + na_values_str=fill( + '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " + ), _default_sep="','", storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] @@ -880,6 +885,9 @@ def read_table( see_also_func_summary=( "Read a comma-separated values (csv) file into DataFrame." 
), + na_values_str=fill( + '", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" " + ), _default_sep=r"'\\t' (tab-stop)", storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] @@ -1534,7 +1542,10 @@ def get_chunk(self, size: int | None = None) -> DataFrame: if self.nrows is not None: if self._currow >= self.nrows: raise StopIteration - size = min(size, self.nrows - self._currow) + if size is None: + size = self.nrows - self._currow + else: + size = min(size, self.nrows - self._currow) return self.read(nrows=size) def __enter__(self) -> Self: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4fce338ccad6f..4b569fb7e39e2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -18,7 +18,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Final, Literal, cast, @@ -31,7 +30,7 @@ from pandas._config import ( config, get_option, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -102,6 +101,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Iterator, Sequence, @@ -2655,7 +2655,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): # reverse converts if dtype.startswith("datetime64"): # recreate with tz if indicated - converted = _set_tz(converted, tz) + converted = _set_tz(converted, tz, dtype) elif dtype == "timedelta64": converted = np.asarray(converted, dtype="m8[ns]") @@ -3036,7 +3036,7 @@ def read_array(self, key: str, start: int | None = None, stop: int | None = None if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) - ret = _set_tz(ret, tz) + ret = _set_tz(ret, tz, dtype) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -3294,7 +3294,7 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_string_dtype() and is_string_array(values, skipna=True): result = result.astype("string[pyarrow_numpy]") return result @@ -3363,7 +3363,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): + if using_string_dtype() and is_string_array(values, skipna=True): df = df.astype("string[pyarrow_numpy]") dfs.append(df) @@ -4735,9 +4735,9 @@ def read( else: # Categorical df = DataFrame._from_arrays([values], columns=cols_, index=index_) - if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): + if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_pyarrow_string_dtype() and is_string_array( + if using_string_dtype() and is_string_array( values, # type: ignore[arg-type] skipna=True, ): @@ -4964,7 +4964,9 @@ def _get_tz(tz: tzinfo) -> str | tzinfo: return zone -def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeArray: +def _set_tz( + values: npt.NDArray[np.int64], tz: str | tzinfo | None, datetime64_dtype: str +) -> DatetimeArray: """ Coerce the values to a DatetimeArray with appropriate tz. 
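[Annotation] Two of the readers.py hunks above are clearest from the caller's side. First, the new `encoding_errors` check in `_read` fails fast on a non-string instead of surfacing an obscure error from the decode path; a sketch:

```python
# The up-front type check added to _read: a non-str encoding_errors is
# rejected with a clear message before any I/O happens.
import io

import pandas as pd

try:
    pd.read_csv(io.StringIO("a\n1\n"), encoding_errors=b"strict")
except ValueError as err:
    print(err)  # encoding_errors must be a string, got bytes
```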
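[Annotation] Second, `TextFileReader.get_chunk` now tolerates `size=None` when `nrows` was passed: the old `min(size, ...)` raised a TypeError on None, while the new branch reads whatever remains of the `nrows` budget. A sketch of that interplay:

```python
# get_chunk with no explicit size and nrows set: the remaining budget
# (nrows - _currow) is used instead of evaluating min(None, ...).
import io

import pandas as pd

buf = io.StringIO("a\n1\n2\n3\n4\n5\n")
with pd.read_csv(buf, nrows=4, iterator=True) as reader:
    print(len(reader.get_chunk(3)))  # 3 rows
    print(len(reader.get_chunk()))   # 1 row: clamped by nrows=4
```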
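[Annotation] The `_set_tz` hunk below stops hard-coding nanoseconds when reconstructing tz-aware data from HDF5: the resolution is parsed out of the stored dtype string. The extraction it relies on, including the `datetime64[25s]` style named in the new docstring:

```python
# np.datetime_data yields (unit, count) from a datetime64 dtype; _set_tz
# feeds the unit to tz_to_dtype instead of assuming "ns".
import numpy as np

for stored in ("datetime64[ns]", "datetime64[s]", "datetime64[25s]"):
    print(stored, "->", np.datetime_data(np.dtype(stored)))
# datetime64[ns] -> ('ns', 1)
# datetime64[s] -> ('s', 1)
# datetime64[25s] -> ('s', 25)
```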
@@ -4972,11 +4974,13 @@ def _set_tz(values: npt.NDArray[np.int64], tz: str | tzinfo | None) -> DatetimeA ---------- values : ndarray[int64] tz : str, tzinfo, or None + datetime64_dtype : str, e.g. "datetime64[ns]", "datetime64[25s]" """ assert values.dtype == "i8", values.dtype # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None"; # expected "tzinfo" - dtype = tz_to_dtype(tz=tz, unit="ns") # type: ignore[arg-type] + unit, _ = np.datetime_data(datetime64_dtype) # parsing dtype: unit, count + dtype = tz_to_dtype(tz=tz, unit=unit) # type: ignore[arg-type] dta = DatetimeArray._from_sequence(values, dtype=dtype) return dta diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 12d698a4f76a8..6daf4a24781bd 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -124,6 +124,14 @@ def read_sas( DataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader, file format is inferred from file extension. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_spss : Read an SPSS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. + Examples -------- >>> df = pd.read_sas("sas_data.sas7bdat") # doctest: +SKIP diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 2c464cc7e90c4..313ffa79cbd09 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -52,6 +52,14 @@ def read_spss( DataFrame DataFrame based on the SPSS file. + See Also + -------- + read_csv : Read a comma-separated values (csv) file into a pandas DataFrame. + read_excel : Read an Excel file into a pandas DataFrame. + read_sas : Read an SAS file into a pandas DataFrame. + read_orc : Load an ORC object into a pandas DataFrame. + read_feather : Load a feather-format object into a pandas DataFrame. 
+ Examples -------- >>> df = pd.read_spss("spss_data.sav") # doctest: +SKIP diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 874320f08fb75..4fd7de7a28855 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -23,7 +23,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, cast, overload, @@ -32,7 +31,7 @@ import numpy as np -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -67,6 +66,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Generator, Iterator, Mapping, @@ -1014,7 +1014,7 @@ def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: def insert_data(self) -> tuple[list[str], list[np.ndarray]]: if self.index is not None: - temp = self.frame.copy() + temp = self.frame.copy(deep=False) temp.index.names = self.index try: temp.reset_index(inplace=True) @@ -2197,7 +2197,7 @@ def read_table( from pandas.io._util import _arrow_dtype_mapping mapping = _arrow_dtype_mapping().get - elif using_pyarrow_string_dtype(): + elif using_string_dtype(): from pandas.io._util import arrow_string_types_mapper arrow_string_types_mapper() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b87ec94b85bb0..4be06f93689f2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -25,7 +25,6 @@ IO, TYPE_CHECKING, AnyStr, - Callable, Final, cast, ) @@ -74,6 +73,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -91,9 +91,9 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " - "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," - "and 119 (Stata 15/16, over 32,767 variables)." + "versions 102, 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), " + "113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), " + "118 (Stata 14/15/16), and 119 (Stata 15/16, over 32,767 variables)." ) _statafile_processing_params1 = """\ @@ -983,6 +983,19 @@ def __init__(self) -> None: np.float64(struct.unpack(" None: # These missing values are the generic '.' 
in Stata, and are used # to replace nans - self.MISSING_VALUES = { + self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = { "b": 101, "h": 32741, "l": 2147483621, @@ -1352,8 +1365,10 @@ def _get_variable_labels(self) -> list[str]: def _get_nobs(self) -> int: if self._format_version >= 118: return self._read_uint64() - else: + elif self._format_version >= 103: return self._read_uint32() + else: + return self._read_uint16() def _get_data_label(self) -> str: if self._format_version >= 118: @@ -1393,9 +1408,24 @@ def _get_seek_variable_labels(self) -> int: def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]: + if self._format_version not in [ + 102, + 103, + 104, + 105, + 108, + 110, + 111, + 113, + 114, + 115, + ]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() + # Note 102 format will have a zero in this header position, so support + # relies on little-endian being set whenever this value isn't one, + # even though for later releases strictly speaking the value should + # be either one or two to be valid self._byteorder = ">" if self._read_int8() == 0x1 else "<" self._filetype = self._read_int8() self._path_or_buf.read(1) # unused @@ -1405,7 +1435,8 @@ def _read_old_header(self, first_char: bytes) -> None: self._data_label = self._get_data_label() - self._time_stamp = self._get_time_stamp() + if self._format_version >= 105: + self._time_stamp = self._get_time_stamp() # descriptors if self._format_version >= 111: @@ -1599,14 +1630,13 @@ def _read_strls(self) -> None: v_o = self._read_uint64() else: buf = self._path_or_buf.read(12) - # Only tested on little endian file on little endian machine. + # Only tested on little endian machine. 
v_size = 2 if self._format_version == 118 else 3 if self._byteorder == "<": buf = buf[0:v_size] + buf[4 : (12 - v_size)] else: - # This path may not be correct, impossible to test - buf = buf[0:v_size] + buf[(4 + v_size) :] - v_o = struct.unpack("Q", buf)[0] + buf = buf[4 - v_size : 4] + buf[(4 + v_size) :] + v_o = struct.unpack(f"{self._byteorder}Q", buf)[0] typ = self._read_uint8() length = self._read_uint32() va = self._path_or_buf.read(length) @@ -1787,15 +1817,31 @@ def read( return data def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: + # missing code for double was different in version 105 and prior + old_missingdouble = float.fromhex("0x1.0p333") + # Check for missing values, and replace if found replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] - if fmt not in self.VALID_RANGE: - continue + # recode instances of the old missing code to the currently used value + if self._format_version <= 105 and fmt == "d": + data.iloc[:, i] = data.iloc[:, i].replace( + old_missingdouble, self.MISSING_VALUES["d"] + ) + + if self._format_version <= 111: + if fmt not in self.OLD_VALID_RANGE: + continue + + fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE + nmin, nmax = self.OLD_VALID_RANGE[fmt] + else: + if fmt not in self.VALID_RANGE: + continue - fmt = cast(str, fmt) # only strs in VALID_RANGE - nmin, nmax = self.VALID_RANGE[fmt] + fmt = cast(str, fmt) # only strs in VALID_RANGE + nmin, nmax = self.VALID_RANGE[fmt] series = data.iloc[:, i] # appreciably faster to do this with ndarray instead of Series @@ -1810,7 +1856,12 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra umissing, umissing_loc = np.unique(series[missing], return_inverse=True) replacement = Series(series, dtype=object) for j, um in enumerate(umissing): - missing_value = StataMissingValue(um) + if self._format_version <= 111: + missing_value = StataMissingValue( + float(self.MISSING_VALUES[fmt]) + ) + else: + missing_value = StataMissingValue(um) loc = missing_loc[umissing_loc == j] replacement.iloc[loc] = missing_value @@ -3037,6 +3088,8 @@ def __init__( if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) + # Flag whether chosen byteorder matches the system on which we're running + self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder) gso_v_type = "I" # uint32 gso_o_type = "Q" # uint64 @@ -3049,13 +3102,20 @@ def __init__( o_size = 6 else: # version == 119 o_size = 5 - self._o_offet = 2 ** (8 * (8 - o_size)) + if self._native_byteorder: + self._o_offet = 2 ** (8 * (8 - o_size)) + else: + self._o_offet = 2 ** (8 * o_size) self._gso_o_type = gso_o_type self._gso_v_type = gso_v_type def _convert_key(self, key: tuple[int, int]) -> int: v, o = key - return v + self._o_offet * o + if self._native_byteorder: + return v + self._o_offet * o + else: + # v, o will be swapped when applying byteorder + return o + self._o_offet * v def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: """ @@ -3532,7 +3592,9 @@ def _convert_strls(self, data: DataFrame) -> DataFrame: ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) + ssw = StataStrLWriter( + data, convert_cols, version=self._dta_version, byteorder=self._byteorder + ) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a6cd06cd61687..8c7381a926e72 100644 --- a/pandas/io/xml.py +++ 
b/pandas/io/xml.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, ) from pandas._libs import lib @@ -35,7 +34,10 @@ from pandas.io.parsers import TextParser if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Callable, + Sequence, + ) from xml.etree.ElementTree import Element from lxml import etree diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index ea5daf02b7252..b60392368d944 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -3,7 +3,6 @@ import importlib from typing import ( TYPE_CHECKING, - Callable, Literal, ) @@ -27,6 +26,7 @@ if TYPE_CHECKING: from collections.abc import ( + Callable, Hashable, Sequence, ) @@ -652,6 +652,9 @@ class PlotAccessor(PandasObject): ---------- data : Series or DataFrame The object for which the method is called. + + Attributes + ---------- x : label or position, default None Only used if data is a DataFrame. y : label, position or list of label, positions, default None @@ -791,6 +794,21 @@ class PlotAccessor(PandasObject): If the backend is not the default matplotlib one, the return value will be the object returned by the backend. + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + DataFrame.hist : Make a histogram. + DataFrame.boxplot : Make a box plot. + DataFrame.plot.scatter : Make a scatter plot with varying marker + point size and color. + DataFrame.plot.hexbin : Make a hexagonal binning plot of + two variables. + DataFrame.plot.kde : Make Kernel Density Estimate plot using + Gaussian kernels. + DataFrame.plot.area : Make a stacked area plot. + DataFrame.plot.bar : Make a bar plot. + DataFrame.plot.barh : Make a horizontal bar plot. + Notes ----- - See matplotlib documentation online for more on this subject @@ -967,10 +985,7 @@ def __call__(self, *args, **kwargs): f"Valid plot kinds: {self._all_kinds}" ) - # The original data structured can be transformed before passed to the - # backend. For example, for DataFrame is common to set the index as the - # `x` parameter, and return a Series with the parameter `y` as values. - data = self._parent.copy() + data = self._parent if isinstance(data, ABCSeries): kwargs["reuse_plot"] = True @@ -990,7 +1005,7 @@ def __call__(self, *args, **kwargs): if is_integer(y) and not holds_integer(data.columns): y = data.columns[y] # converted to series actually. copy to not modify - data = data[y].copy() + data = data[y].copy(deep=False) data.index.name = y elif isinstance(data, ABCDataFrame): data_cols = data.columns @@ -1017,8 +1032,7 @@ def __call__(self, *args, **kwargs): except (IndexError, KeyError, TypeError): pass - # don't overwrite - data = data[y].copy() + data = data[y] if isinstance(data, ABCSeries): label_name = label_kw or y @@ -1436,6 +1450,7 @@ def kde( self, bw_method: Literal["scott", "silverman"] | float | Callable | None = None, ind: np.ndarray | int | None = None, + weights: np.ndarray | None = None, **kwargs, ) -> PlotAccessor: """ @@ -1461,6 +1476,9 @@ def kde( 1000 equally spaced points are used. If `ind` is a NumPy array, the KDE is evaluated at the points passed. If `ind` is an integer, `ind` number of equally spaced points are used. + weights : NumPy array, optional + Weights of datapoints. This must be the same shape as datapoints. + If None (default), the samples are assumed to be equally weighted. **kwargs Additional keyword arguments are documented in :meth:`DataFrame.plot`. 
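[Annotation] The `kde` change above adds a `weights` argument that is threaded through to `scipy.stats.gaussian_kde`. A sketch of the effect with arbitrary toy weights (requires matplotlib and scipy, and assumes the new signature from this diff):

```python
# weights= forwarded by Series.plot.kde: same data, but the first sample
# is overweighted, so the estimated density mass shifts toward it.
import numpy as np

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
w = np.array([5.0, 1.0, 1.0, 1.0, 1.0])  # must match the number of samples

ax = s.plot.kde(ind=np.linspace(0.0, 6.0, 100), weights=w)
ax.figure.savefig("weighted_kde.png")  # or plt.show() interactively
```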
@@ -1546,7 +1564,7 @@ def kde( >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) """ - return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs) + return self(kind="kde", bw_method=bw_method, ind=ind, weights=weights, **kwargs) density = kde @@ -1583,7 +1601,7 @@ def area( See Also -------- - DataFrame.plot : Make plots of DataFrame using matplotlib / pylab. + DataFrame.plot : Make plots of DataFrame using matplotlib. Examples -------- diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 75c61da03795a..87f3ca09ad346 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -69,7 +69,7 @@ def plot(data, kind, **kwargs): kwargs["ax"] = getattr(ax, "left_ax", ax) plot_obj = PLOT_CLASSES[kind](data, **kwargs) plot_obj.generate() - plot_obj.draw() + plt.draw_if_interactive() return plot_obj.result diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 2a28cd94b64e5..6bb10068bee38 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -7,7 +7,7 @@ ) import warnings -from matplotlib.artist import setp +import matplotlib as mpl import numpy as np from pandas._libs import lib @@ -274,13 +274,13 @@ def maybe_color_bp(bp, color_tup, **kwds) -> None: # GH#30346, when users specifying those arguments explicitly, our defaults # for these four kwargs should be overridden; if not, use Pandas settings if not kwds.get("boxprops"): - setp(bp["boxes"], color=color_tup[0], alpha=1) + mpl.artist.setp(bp["boxes"], color=color_tup[0], alpha=1) if not kwds.get("whiskerprops"): - setp(bp["whiskers"], color=color_tup[1], alpha=1) + mpl.artist.setp(bp["whiskers"], color=color_tup[1], alpha=1) if not kwds.get("medianprops"): - setp(bp["medians"], color=color_tup[2], alpha=1) + mpl.artist.setp(bp["medians"], color=color_tup[2], alpha=1) if not kwds.get("capprops"): - setp(bp["caps"], color=color_tup[3], alpha=1) + mpl.artist.setp(bp["caps"], color=color_tup[3], alpha=1) def _grouped_plot_by_column( @@ -311,8 +311,6 @@ def _grouped_plot_by_column( layout=layout, ) - _axes = flatten_axes(axes) - # GH 45465: move the "by" label based on "vert" xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None) if kwargs.get("vert", True): @@ -322,8 +320,7 @@ def _grouped_plot_by_column( ax_values = [] - for i, col in enumerate(columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), columns): gp_col = grouped[col] keys, values = zip(*gp_col) re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs) @@ -455,7 +452,7 @@ def plot_group(keys, values, ax: Axes, **kwds): if ax is None: rc = {"figure.figsize": figsize} if figsize is not None else {} - with plt.rc_context(rc): + with mpl.rc_context(rc): ax = plt.gca() data = data._get_numeric_data() naxes = len(data.columns) @@ -531,10 +528,8 @@ def boxplot_frame_groupby( figsize=figsize, layout=layout, ) - axes = flatten_axes(axes) - data = {} - for (key, group), ax in zip(grouped, axes): + for (key, group), ax in zip(grouped, flatten_axes(axes)): d = group.boxplot( ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds ) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 50fa722f6dd72..fc63d65f1e160 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -14,13 +14,8 @@ ) import warnings +import matplotlib as mpl import matplotlib.dates as mdates -from 
matplotlib.ticker import ( - AutoLocator, - Formatter, - Locator, -) -from matplotlib.transforms import nonsingular import matplotlib.units as munits import numpy as np @@ -174,7 +169,7 @@ def axisinfo(unit, axis) -> munits.AxisInfo | None: if unit != "time": return None - majloc = AutoLocator() + majloc = mpl.ticker.AutoLocator() # pyright: ignore[reportAttributeAccessIssue] majfmt = TimeFormatter(majloc) return munits.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") @@ -184,7 +179,7 @@ def default_units(x, axis) -> str: # time formatter -class TimeFormatter(Formatter): +class TimeFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] def __init__(self, locs) -> None: self.locs = locs @@ -561,7 +556,8 @@ def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: return ppd, ppm, ppy -def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] @@ -760,7 +756,8 @@ def _second_finder(label_interval: int) -> None: return info -def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin @@ -831,7 +828,8 @@ def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -878,7 +876,8 @@ def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: return info -def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: +@functools.cache +def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 @@ -917,7 +916,7 @@ def get_finder(freq: BaseOffset): raise NotImplementedError(f"Unsupported frequency: {dtype_code}") -class TimeSeries_DateLocator(Locator): +class TimeSeries_DateLocator(mpl.ticker.Locator): # pyright: ignore[reportAttributeAccessIssue] """ Locates the ticks along an axis controlled by a :class:`Series`. @@ -998,7 +997,7 @@ def autoscale(self): if vmin == vmax: vmin -= 1 vmax += 1 - return nonsingular(vmin, vmax) + return mpl.transforms.nonsingular(vmin, vmax) # ------------------------------------------------------------------------- @@ -1006,7 +1005,7 @@ def autoscale(self): # ------------------------------------------------------------------------- -class TimeSeries_DateFormatter(Formatter): +class TimeSeries_DateFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] """ Formats the ticks along an axis controlled by a :class:`PeriodIndex`. @@ -1082,7 +1081,7 @@ def __call__(self, x, pos: int | None = 0) -> str: return period.strftime(fmt) -class TimeSeries_TimedeltaFormatter(Formatter): +class TimeSeries_TimedeltaFormatter(mpl.ticker.Formatter): # pyright: ignore[reportAttributeAccessIssue] """ Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. 
""" diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index fffeb9b82492f..fb7d785a94bc4 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -55,7 +55,6 @@ from pandas.core.dtypes.missing import isna import pandas.core.common as com -from pandas.core.frame import DataFrame from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -94,6 +93,7 @@ ) from pandas import ( + DataFrame, Index, Series, ) @@ -107,9 +107,7 @@ def _color_in_style(style: str) -> bool: """ Check if there is a color letter in the style string. """ - from matplotlib.colors import BASE_COLORS - - return not set(BASE_COLORS).isdisjoint(style) + return not set(mpl.colors.BASE_COLORS).isdisjoint(style) class MPLPlot(ABC): @@ -176,8 +174,6 @@ def __init__( style=None, **kwds, ) -> None: - import matplotlib.pyplot as plt - # if users assign an empty list or tuple, raise `ValueError` # similar to current `df.box` and `df.hist` APIs. if by in ([], ()): @@ -187,7 +183,7 @@ def __init__( # Assign the rest of columns into self.columns if by is explicitly defined # while column is not, only need `columns` in hist/box plot when it's DF # TODO: Might deprecate `column` argument in future PR (#28373) - if isinstance(data, DataFrame): + if isinstance(data, ABCDataFrame): if column: self.columns = com.maybe_make_list(column) elif self.by is None: @@ -238,7 +234,7 @@ def __init__( self.rot = self._default_rot if grid is None: - grid = False if secondary_y else plt.rcParams["axes.grid"] + grid = False if secondary_y else mpl.rcParams["axes.grid"] self.grid = grid self.legend = legend @@ -497,10 +493,6 @@ def _get_nseries(self, data: Series | DataFrame) -> int: def nseries(self) -> int: return self._get_nseries(self.data) - @final - def draw(self) -> None: - self.plt.draw_if_interactive() - @final def generate(self) -> None: self._compute_plot_data() @@ -570,6 +562,8 @@ def axes(self) -> Sequence[Axes]: @final @cache_readonly def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: + import matplotlib.pyplot as plt + if self.subplots: naxes = ( self.nseries if isinstance(self.subplots, bool) else len(self.subplots) @@ -584,7 +578,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: layout_type=self._layout_type, ) elif self.ax is None: - fig = self.plt.figure(figsize=self.figsize) + fig = plt.figure(figsize=self.figsize) axes = fig.add_subplot(111) else: fig = self.ax.get_figure() @@ -592,7 +586,7 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: fig.set_size_inches(self.figsize) axes = self.ax - axes = flatten_axes(axes) + axes = np.fromiter(flatten_axes(axes), dtype=object) if self.logx is True or self.loglog is True: [a.set_xscale("log") for a in axes] @@ -899,7 +893,13 @@ def _make_legend(self) -> None: elif self.subplots and self.legend: for ax in self.axes: if ax.get_visible(): - ax.legend(loc="best") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "No artists with labels found to put in legend.", + UserWarning, + ) + ax.legend(loc="best") @final @staticmethod @@ -918,13 +918,6 @@ def _get_ax_legend(ax: Axes): ax = other_ax return ax, leg - @final - @cache_readonly - def plt(self): - import matplotlib.pyplot as plt - - return plt - _need_to_set_index = False @final @@ -1219,9 +1212,9 @@ def _get_errorbars( @final def _get_subplots(self, fig: Figure) -> list[Axes]: if Version(mpl.__version__) < Version("3.8"): - from matplotlib.axes import Subplot as Klass + Klass = 
mpl.axes.Subplot else: - from matplotlib.axes import Axes as Klass + Klass = mpl.axes.Axes return [ ax @@ -1386,7 +1379,7 @@ def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool): if c is not None and color is not None: raise TypeError("Specify exactly one of `c` and `color`") if c is None and color is None: - c_values = self.plt.rcParams["patch.facecolor"] + c_values = mpl.rcParams["patch.facecolor"] elif color is not None: c_values = color elif color_by_categorical: @@ -1411,12 +1404,10 @@ def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): cmap = None if color_by_categorical and cmap is not None: - from matplotlib import colors - n_cats = len(self.data[c].cat.categories) - cmap = colors.ListedColormap([cmap(i) for i in range(cmap.N)]) + cmap = mpl.colors.ListedColormap([cmap(i) for i in range(cmap.N)]) bounds = np.linspace(0, n_cats, n_cats + 1) - norm = colors.BoundaryNorm(bounds, cmap.N) + norm = mpl.colors.BoundaryNorm(bounds, cmap.N) # TODO: warn that we are ignoring self.norm if user specified it? # Doesn't happen in any tests 2023-11-09 else: @@ -1676,8 +1667,6 @@ def _update_stacker(cls, ax: Axes, stacking_id: int | None, values) -> None: ax._stacker_neg_prior[stacking_id] += values # type: ignore[attr-defined] def _post_plot_logic(self, ax: Axes, data) -> None: - from matplotlib.ticker import FixedLocator - def get_label(i): if is_float(i) and i.is_integer(): i = int(i) @@ -1691,7 +1680,7 @@ def get_label(i): xticklabels = [get_label(x) for x in xticks] # error: Argument 1 to "FixedLocator" has incompatible type "ndarray[Any, # Any]"; expected "Sequence[float]" - ax.xaxis.set_major_locator(FixedLocator(xticks)) # type: ignore[arg-type] + ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(xticks)) # type: ignore[arg-type] ax.set_xticklabels(xticklabels) # If the index is an irregular time series, then by default @@ -2046,9 +2035,12 @@ def _kind(self) -> Literal["pie"]: _layout_type = "horizontal" - def __init__(self, data, kind=None, **kwargs) -> None: + def __init__(self, data: Series | DataFrame, kind=None, **kwargs) -> None: data = data.fillna(value=0) - if (data < 0).any().any(): + lt_zero = data < 0 + if isinstance(data, ABCDataFrame) and lt_zero.any().any(): + raise ValueError(f"{self._kind} plot doesn't allow negative values") + elif isinstance(data, ABCSeries) and lt_zero.any(): raise ValueError(f"{self._kind} plot doesn't allow negative values") MPLPlot.__init__(self, data, kind=kind, **kwargs) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index ca635386be335..97e510982ab93 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -95,11 +95,12 @@ def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]): def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray: """Calculate bins given data""" nd_values = data.infer_objects()._get_numeric_data() - values = np.ravel(nd_values) + values = nd_values.values + if nd_values.ndim == 2: + values = values.reshape(-1) values = values[~isna(values)] - hist, bins = np.histogram(values, bins=bins, range=self._bin_range) - return bins + return np.histogram_bin_edges(values, bins=bins, range=self._bin_range) # error: Signature of "_plot" incompatible with supertype "LinePlot" @classmethod @@ -268,6 +269,7 @@ def _plot( # type: ignore[override] y: np.ndarray, style=None, bw_method=None, + weights=None, ind=None, column_num=None, stacking_id: int | None = None, @@ -276,7 +278,7 @@ def _plot( # type: 
ignore[override] from scipy.stats import gaussian_kde y = remove_na_arraylike(y) - gkde = gaussian_kde(y, bw_method=bw_method) + gkde = gaussian_kde(y, bw_method=bw_method, weights=weights) y = gkde.evaluate(ind) lines = MPLPlot._plot(ax, ind, y, style=style, **kwds) @@ -322,10 +324,7 @@ def _grouped_plot( naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout ) - _axes = flatten_axes(axes) - - for i, (key, group) in enumerate(grouped): - ax = _axes[i] + for ax, (key, group) in zip(flatten_axes(axes), grouped): if numeric_only and isinstance(group, ABCDataFrame): group = group._get_numeric_data() plotf(group, ax, **kwargs) @@ -557,12 +556,9 @@ def hist_frame( figsize=figsize, layout=layout, ) - _axes = flatten_axes(axes) - can_set_label = "label" not in kwds - for i, col in enumerate(data.columns): - ax = _axes[i] + for ax, col in zip(flatten_axes(axes), data.columns): if legend and can_set_label: kwds["label"] = col ax.hist(data[col].dropna().values, bins=bins, **kwds) diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 1f9212587e05e..4a891ec27e8cb 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -3,8 +3,7 @@ import random from typing import TYPE_CHECKING -from matplotlib import patches -import matplotlib.lines as mlines +import matplotlib as mpl import numpy as np from pandas.core.dtypes.missing import notna @@ -129,7 +128,7 @@ def scatter_matrix( def _get_marker_compat(marker): - if marker not in mlines.lineMarkers: + if marker not in mpl.lines.lineMarkers: return "o" return marker @@ -190,10 +189,10 @@ def normalize(series): ) ax.legend() - ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) + ax.add_patch(mpl.patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) for xy, name in zip(s, df.columns): - ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) + ax.add_patch(mpl.patches.Circle(xy, radius=0.025, facecolor="gray")) if xy[0] < 0.0 and xy[1] < 0.0: ax.text( diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index d725d53bd21ec..962f9711d9916 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -260,9 +260,7 @@ def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color] def _get_default_colors(num_colors: int) -> list[Color]: """Get `num_colors` of default colors from matplotlib rc params.""" - import matplotlib.pyplot as plt - - colors = [c["color"] for c in plt.rcParams["axes.prop_cycle"]] + colors = [c["color"] for c in mpl.rcParams["axes.prop_cycle"]] return colors[0:num_colors] diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index d438f521c0dbc..d95ccad2da565 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -333,7 +333,7 @@ def format_dateaxis( default, changing the limits of the x axis will intelligently change the positions of the ticks. 
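The hunk above threads a new `weights` keyword from the KDE plotting path into `scipy.stats.gaussian_kde`. A minimal sketch of what that enables, with hypothetical sample data and weights (not taken from the diff):

    import numpy as np
    from scipy.stats import gaussian_kde

    rng = np.random.default_rng(2)
    y = rng.standard_normal(200)
    w = rng.uniform(0.5, 1.5, size=200)  # hypothetical per-observation weights

    # `weights` is forwarded as-is, turning the KDE into a weighted density
    # estimate instead of treating every observation equally.
    gkde = gaussian_kde(y, bw_method="silverman", weights=w)
    ind = np.linspace(y.min(), y.max(), 50)  # evaluation grid
    density = gkde.evaluate(ind)

In user-facing terms this presumably surfaces as a `weights=...` keyword on the kde/density plot kinds, since plot kwargs are passed through to `_plot`.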
""" - from matplotlib import pylab + import matplotlib.pyplot as plt # handle index specific formatting # Note: DatetimeIndex does not use this @@ -365,4 +365,4 @@ def format_dateaxis( else: raise TypeError("index type not supported") - pylab.draw_if_interactive() + plt.draw_if_interactive() diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 50cfdbd967ea7..f9c370b2486fd 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -5,8 +5,7 @@ from typing import TYPE_CHECKING import warnings -from matplotlib import ticker -import matplotlib.table +import matplotlib as mpl import numpy as np from pandas.util._exceptions import find_stack_level @@ -19,7 +18,10 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import ( + Generator, + Iterable, + ) from matplotlib.axes import Axes from matplotlib.axis import Axis @@ -77,7 +79,7 @@ def table( # error: Argument "cellText" to "table" has incompatible type "ndarray[Any, # Any]"; expected "Sequence[Sequence[str]] | None" - return matplotlib.table.table( + return mpl.table.table( ax, cellText=cellText, # type: ignore[arg-type] rowLabels=rowLabels, @@ -232,7 +234,7 @@ def create_subplots( else: if is_list_like(ax): if squeeze: - ax = flatten_axes(ax) + ax = np.fromiter(flatten_axes(ax), dtype=object) if layout is not None: warnings.warn( "When passing multiple axes, layout keyword is ignored.", @@ -261,7 +263,7 @@ def create_subplots( if squeeze: return fig, ax else: - return fig, flatten_axes(ax) + return fig, np.fromiter(flatten_axes(ax), dtype=object) else: warnings.warn( "To output multiple subplots, the figure containing " @@ -327,10 +329,10 @@ def _remove_labels_from_axis(axis: Axis) -> None: # set_visible will not be effective if # minor axis has NullLocator and NullFormatter (default) - if isinstance(axis.get_minor_locator(), ticker.NullLocator): - axis.set_minor_locator(ticker.AutoLocator()) - if isinstance(axis.get_minor_formatter(), ticker.NullFormatter): - axis.set_minor_formatter(ticker.FormatStrFormatter("")) + if isinstance(axis.get_minor_locator(), mpl.ticker.NullLocator): + axis.set_minor_locator(mpl.ticker.AutoLocator()) + if isinstance(axis.get_minor_formatter(), mpl.ticker.NullFormatter): + axis.set_minor_formatter(mpl.ticker.FormatStrFormatter("")) for t in axis.get_minorticklabels(): t.set_visible(False) @@ -440,12 +442,13 @@ def handle_shared_axes( _remove_labels_from_axis(ax.yaxis) -def flatten_axes(axes: Axes | Iterable[Axes]) -> np.ndarray: +def flatten_axes(axes: Axes | Iterable[Axes]) -> Generator[Axes, None, None]: if not is_list_like(axes): - return np.array([axes]) + yield axes # type: ignore[misc] elif isinstance(axes, (np.ndarray, ABCIndex)): - return np.asarray(axes).ravel() - return np.array(axes) + yield from np.asarray(axes).reshape(-1) + else: + yield from axes # type: ignore[misc] def set_ticks_props( @@ -455,17 +458,15 @@ def set_ticks_props( ylabelsize: int | None = None, yrot=None, ): - import matplotlib.pyplot as plt - for ax in flatten_axes(axes): if xlabelsize is not None: - plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize) # type: ignore[arg-type] if xrot is not None: - plt.setp(ax.get_xticklabels(), rotation=xrot) + mpl.artist.setp(ax.get_xticklabels(), rotation=xrot) # type: ignore[arg-type] if ylabelsize is not None: - plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize) 
# type: ignore[arg-type] if yrot is not None: - plt.setp(ax.get_yticklabels(), rotation=yrot) + mpl.artist.setp(ax.get_yticklabels(), rotation=yrot) # type: ignore[arg-type] return axes diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index af7ddf39283c0..d8455f44ef0d1 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -33,6 +33,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: Parameters ---------- ax : Matplotlib axes object + The axes on which to draw the table. data : DataFrame or Series Data for table contents. **kwargs @@ -43,6 +44,12 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: Returns ------- matplotlib table object + The created table as a matplotlib Table object. + + See Also + -------- + DataFrame.plot : Make plots of DataFrame using matplotlib. + matplotlib.pyplot.table : Create a table from data in a Matplotlib plot. Examples -------- @@ -472,6 +479,7 @@ def parallel_coordinates( Parameters ---------- frame : DataFrame + The DataFrame to be plotted. class_column : str Column name containing class names. cols : list, optional @@ -498,6 +506,13 @@ def parallel_coordinates( Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the parallel coordinates plot. + + See Also + -------- + plotting.andrews_curves : Generate a matplotlib plot for visualizing clusters + of multivariate data. + plotting.radviz : Plot a multidimensional dataset in 2D. Examples -------- @@ -591,6 +606,12 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib axes containing the autocorrelation plot. + + See Also + -------- + Series.autocorr : Compute the lag-N autocorrelation for a Series. + plotting.lag_plot : Lag plot for time series. Examples -------- @@ -617,6 +638,14 @@ class _Options(dict): the same as the plot function parameters, but is stored in a canonical format that makes it easy to breakdown into groups later. + See Also + -------- + plotting.register_matplotlib_converters : Register pandas formatters and + converters with matplotlib. + plotting.bootstrap_plot : Bootstrap plot on mean, median and mid-range statistics. + plotting.autocorrelation_plot : Autocorrelation plot for time series. + plotting.lag_plot : Lag plot for time series. 
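The `pandas.plotting` docstrings above now spell out the `ax` parameter and the `matplotlib.table.Table` return value. A short usage sketch of the documented API (the axes setup here is illustrative):

    import matplotlib.pyplot as plt
    import pandas as pd
    from pandas.plotting import table

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    fig, ax = plt.subplots()
    ax.axis("off")                     # hide the axes frame so only the table shows
    tbl = table(ax, df, loc="center")  # extra kwargs go to matplotlib.pyplot.table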
+ Examples -------- diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index cbc68265a1cc1..b0475b64a844e 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -61,18 +63,63 @@ def test_apply(float_frame, engine, request): assert result.index is float_frame.index +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) -def test_apply_args(float_frame, axis, raw, engine, request): - if engine == "numba": - mark = pytest.mark.xfail(reason="numba engine doesn't support args") - request.node.add_marker(mark) +@pytest.mark.parametrize("nopython", [True, False]) +def test_apply_args(float_frame, axis, raw, engine, nopython): + engine_kwargs = {"nopython": nopython} result = float_frame.apply( - lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine + lambda x, y: x + y, + axis, + args=(1,), + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, ) expected = float_frame + 1 tm.assert_frame_equal(result, expected) + # GH:58712 + result = float_frame.apply( + lambda x, a, b: x + a + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + expected = float_frame + 3 + tm.assert_frame_equal(result, expected) + + if engine == "numba": + # keyword-only arguments are not supported in numba + with pytest.raises( + pd.errors.NumbaUtilError, + match="numba does not support keyword-only arguments", + ): + float_frame.apply( + lambda x, a, *, b: x + a + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + with pytest.raises( + pd.errors.NumbaUtilError, + match="numba does not support keyword-only arguments", + ): + float_frame.apply( + lambda *x, b: x[0] + x[1] + b, + args=(1,), + b=2, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + ) + def test_apply_categorical_func(): # GH 9573 @@ -324,18 +371,18 @@ def test_apply_mixed_dtype_corner(): result = df[:0].apply(np.mean, axis=1) # the result here is actually kind of ambiguous, should it be a Series # or a DataFrame? 
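The rewritten `test_apply_args` exercises positional `args` together with extra keyword arguments in `DataFrame.apply`, and GH 58712 extends the same handling to the numba engine (which rejects keyword-only parameters). A condensed sketch of the plain-Python path:

    import pandas as pd

    df = pd.DataFrame({"A": [1.0, 2.0], "B": [3.0, 4.0]})

    # `args` supplies positional arguments; remaining keywords (here `b`)
    # are forwarded to the applied function itself.
    result = df.apply(lambda x, a, b: x + a + b, args=(1,), b=2)
    pd.testing.assert_frame_equal(result, df + 3)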
- expected = Series(np.nan, index=pd.Index([], dtype="int64")) + expected = Series(dtype=np.float64) tm.assert_series_equal(result, expected) def test_apply_mixed_dtype_corner_indexing(): df = DataFrame({"A": ["foo"], "B": [1.0]}) result = df.apply(lambda x: x["A"], axis=1) - expected = Series(["foo"], index=[0]) + expected = Series(["foo"], index=range(1)) tm.assert_series_equal(result, expected) result = df.apply(lambda x: x["B"], axis=1) - expected = Series([1.0], index=[0]) + expected = Series([1.0], index=range(1)) tm.assert_series_equal(result, expected) @@ -993,7 +1040,7 @@ def test_result_type(int_frame_const_col): result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") expected = df.copy() - expected.columns = [0, 1, 2] + expected.columns = range(3) tm.assert_frame_equal(result, expected) @@ -1003,7 +1050,7 @@ def test_result_type_shorter_list(int_frame_const_col): df = int_frame_const_col result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") expected = df[["A", "B"]].copy() - expected.columns = [0, 1] + expected.columns = range(2) tm.assert_frame_equal(result, expected) @@ -1169,6 +1216,7 @@ def test_agg_with_name_as_column_name(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_agg_multiple_mixed(): # GH 20909 mdf = DataFrame( @@ -1286,6 +1334,15 @@ def test_agg_reduce(axis, float_frame): tm.assert_frame_equal(result, expected) +def test_named_agg_reduce_axis1_raises(float_frame): + name1, name2 = float_frame.axes[0].unique()[:2].sort_values() + msg = "Named aggregation is not supported when axis=1." + for axis in [1, "columns"]: + with pytest.raises(NotImplementedError, match=msg): + float_frame.agg(row1=(name1, "sum"), row2=(name2, "max"), axis=axis) + + +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nuiscance_columns(): # GH 15015 df = DataFrame( @@ -1462,6 +1519,7 @@ def test_apply_datetime_tz_issue(engine, request): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) def test_mixed_column_raises(df, method, using_infer_string): diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 57b81711ddb48..aee9100702350 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -17,6 +19,7 @@ def apply_axis(request): return request.param +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) @@ -40,6 +43,7 @@ def test_numba_vs_python_string_index(): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index e0c5e337fb746..732652f24e2eb 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import WASM from 
pandas.core.dtypes.common import is_number @@ -79,6 +81,7 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -137,6 +140,7 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -287,7 +291,7 @@ def test_transform_groupby_kernel_frame(request, float_frame, op): # same thing, but ensuring we have multiple blocks assert "E" not in float_frame.columns float_frame["E"] = float_frame["A"].copy() - assert len(float_frame._mgr.arrays) > 1 + assert len(float_frame._mgr.blocks) > 1 ones = np.ones(float_frame.shape[0]) gb2 = float_frame.groupby(ones) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index d7a8b0510b50f..0730729e2fd94 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -20,13 +20,16 @@ def assert_cannot_add(left, right, msg="cannot add"): """ - Helper to assert that left and right cannot be added. + Helper function to assert that two objects cannot be added. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str, default "cannot add" + The error message expected in the TypeError. """ with pytest.raises(TypeError, match=msg): left + right @@ -36,13 +39,17 @@ def assert_cannot_add(left, right, msg="cannot add"): def assert_invalid_addsub_type(left, right, msg=None): """ - Helper to assert that left and right can be neither added nor subtracted. + Helper function to assert that two objects can + neither be added nor subtracted. Parameters ---------- left : object + The first operand. right : object + The second operand. msg : str or None, default None + The error message expected in the TypeError. 
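The expanded docstrings above describe the arithmetic test helpers' contracts. A self-contained sketch of `assert_cannot_add` as documented — the DatetimeIndex example is mine, not from the diff, but adding two datetime arrays is a known TypeError in current pandas:

    import pandas as pd
    import pytest

    def assert_cannot_add(left, right, msg="cannot add"):
        # Both operand orders must raise TypeError matching the message.
        with pytest.raises(TypeError, match=msg):
            left + right
        with pytest.raises(TypeError, match=msg):
            right + left

    # "cannot add DatetimeArray and DatetimeArray", in both directions:
    dti = pd.date_range("2020-01-01", periods=3)
    assert_cannot_add(dti, dti)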
""" with pytest.raises(TypeError, match=msg): left + right diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index f9807310460b4..26dfcf088e74b 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -5,6 +5,7 @@ datetime, time, timedelta, + timezone, ) from itertools import ( product, @@ -14,7 +15,6 @@ import numpy as np import pytest -import pytz from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.offsets import shift_months @@ -389,6 +389,22 @@ def test_dt64_compare_datetime_scalar(self, datetimelike, op, expected): expected = Series(expected, name="A") tm.assert_series_equal(result, expected) + def test_ts_series_numpy_maximum(self): + # GH#50864, test numpy.maximum does not fail + # given a TimeStamp and Series(with dtype datetime64) comparison + ts = Timestamp("2024-07-01") + ts_series = Series( + ["2024-06-01", "2024-07-01", "2024-08-01"], + dtype="datetime64[us]", + ) + + expected = Series( + ["2024-07-01", "2024-07-01", "2024-08-01"], + dtype="datetime64[us]", + ) + + tm.assert_series_equal(expected, np.maximum(ts, ts_series)) + class TestDatetimeIndexComparisons: # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate @@ -1870,8 +1886,10 @@ def test_dt64tz_series_sub_dtitz(self): def test_sub_datetime_compat(self, unit): # see GH#14088 - ser = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), NaT]).dt.as_unit(unit) - dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) + ser = Series([datetime(2016, 8, 23, 12, tzinfo=timezone.utc), NaT]).dt.as_unit( + unit + ) + dt = datetime(2016, 8, 22, 12, tzinfo=timezone.utc) # The datetime object has "us" so we upcast lower units exp_unit = tm.get_finest_unit(unit, "us") exp = Series([Timedelta("1 days"), NaT]).dt.as_unit(exp_unit) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 1b8ad1922b9d2..d205569270705 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1451,7 +1451,7 @@ def test_fill_value_inf_masking(): expected = pd.DataFrame( {"A": [np.inf, 1.0, 0.0, 1.0], "B": [0.0, np.nan, 0.0, np.nan]} ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_dataframe_div_silenced(): diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4ffd76722286a..4b5156d0007bb 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -303,7 +301,6 @@ def test_iadd_string(self): index += "_x" assert "a_x" in index - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") def test_add(self): index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 18f1993c198df..67762e0b89c73 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1086,7 +1086,7 @@ def test_parr_add_timedeltalike_minute_gt1(self, three_days, box_with_array): with pytest.raises(TypeError, match=msg): other - rng - @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5d"]) + @pytest.mark.parametrize("freqstr", ["5ns", 
"5us", "5ms", "5s", "5min", "5h", "5D"]) def test_parr_add_timedeltalike_tick_gt1(self, three_days, freqstr, box_with_array): # GH#23031 adding a time-delta-like offset to a PeriodArray that has # tick-like frequency with n != 1 @@ -1361,7 +1361,12 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - msg = "cannot add PeriodArray and DatetimeArray" + if box_with_array is pd.DataFrame: + # TODO: before implementing resolution-inference we got the same + # message with DataFrame and non-DataFrame. Why did that change? + msg = "cannot add PeriodArray and Timestamp" + else: + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 1021b18f4ae71..dca33dffa3996 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PYPY from pandas import ( @@ -294,6 +296,7 @@ def test_nbytes(self): exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage(self): cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 2791fd55f54d7..2ccc5781c608e 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PY311 from pandas import ( @@ -149,6 +151,7 @@ def test_reorder_categories_raises(self, new_categories): with pytest.raises(ValueError, match=msg): cat.reorder_categories(new_categories) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 1069a9e5aaa90..6752a503016f8 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import ( is_float_dtype, @@ -442,7 +442,7 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") + @pytest.mark.xfail(using_string_dtype(), reason="Can't be NumPy strings") def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 9d4b78ce9944e..e3cb9664e19f2 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -121,7 +121,7 @@ def test_compare_categorical_with_missing(self, a1, a2, categories): @pytest.mark.parametrize( 
"na_value, dtype", [ - (pd.NaT, "datetime64[ns]"), + (pd.NaT, "datetime64[s]"), (None, "float64"), (np.nan, "float64"), (pd.NA, "float64"), diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 3c677142846d7..7f3e8d3ed6e6e 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -6,106 +6,66 @@ @pytest.mark.parametrize( - "to_replace,value,expected,flip_categories", + "to_replace,value,expected", [ # one-to-one - (1, 2, [2, 2, 3], False), - (1, 4, [4, 2, 3], False), - (4, 1, [1, 2, 3], False), - (5, 6, [1, 2, 3], False), + (4, 1, [1, 2, 3]), + (3, 1, [1, 2, 1]), # many-to-one - ([1], 2, [2, 2, 3], False), - ([1, 2], 3, [3, 3, 3], False), - ([1, 2], 4, [4, 4, 3], False), - ((1, 2, 4), 5, [5, 5, 3], False), - ((5, 6), 2, [1, 2, 3], False), - ([1], [2], [2, 2, 3], False), - ([1, 4], [5, 2], [5, 2, 3], False), - # GH49404: overlap between to_replace and value - ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), - # GH50872, GH46884: replace with null - (1, None, [None, 2, 3], False), - (1, pd.NA, [None, 2, 3], False), - # check_categorical sorts categories, which crashes on mixed dtypes - (3, "4", [1, 2, "4"], False), - ([1, 2, "3"], "5", ["5", "5", 3], True), + ((5, 6), 2, [1, 2, 3]), + ((3, 2), 1, [1, 1, 1]), ], ) -@pytest.mark.filterwarnings( - "ignore:.*with CategoricalDtype is deprecated:FutureWarning" -) -def test_replace_categorical_series(to_replace, value, expected, flip_categories): +def test_replace_categorical_series(to_replace, value, expected): # GH 31720 - ser = pd.Series([1, 2, 3], dtype="category") result = ser.replace(to_replace, value) - expected = pd.Series(expected, dtype="category") - ser.replace(to_replace, value, inplace=True) - - if flip_categories: - expected = expected.cat.set_categories(expected.cat.categories[::-1]) - - tm.assert_series_equal(expected, result, check_category_order=False) - tm.assert_series_equal(expected, ser, check_category_order=False) + expected = pd.Series(Categorical(expected, categories=[1, 2, 3])) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "to_replace, value, result, expected_error_msg", + "to_replace,value", [ - ("b", "c", ["a", "c"], "Categorical.categories are different"), - ("c", "d", ["a", "b"], None), - # https://github.com/pandas-dev/pandas/issues/33288 - ("a", "a", ["a", "b"], None), - ("b", None, ["a", None], "Categorical.categories length are different"), + # one-to-one + (3, 5), + # many-to-one + ((3, 2), 5), ], ) -def test_replace_categorical(to_replace, value, result, expected_error_msg): - # GH#26988 - cat = Categorical(["a", "b"]) - expected = Categorical(result) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if expected_error_msg is not None else None - with tm.assert_produces_warning(warn, match=msg): - result = pd.Series(cat, copy=False).replace(to_replace, value)._values +def test_replace_categorical_series_new_category_raises(to_replace, value): + # GH 31720 + ser = pd.Series([1, 2, 3], dtype="category") + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + ser.replace(to_replace, value) - tm.assert_categorical_equal(result, expected) - if to_replace == "b": # the "c" test is supposed to be unchanged - with pytest.raises(AssertionError, match=expected_error_msg): - # ensure non-inplace call does not affect original - tm.assert_categorical_equal(cat, 
expected) - ser = pd.Series(cat, copy=False) - with tm.assert_produces_warning(warn, match=msg): - ser.replace(to_replace, value, inplace=True) - tm.assert_categorical_equal(cat, expected) +def test_replace_maintain_ordering(): + # GH51016 + dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) + ser = pd.Series([0, 1, 2], dtype=dtype) + result = ser.replace(0, 2) + expected = pd.Series([2, 1, 2], dtype=dtype) + tm.assert_series_equal(expected, result, check_category_order=True) def test_replace_categorical_ea_dtype(): # GH49404 - cat = Categorical(pd.array(["a", "b"], dtype="string")) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + cat = Categorical(pd.array(["a", "b", "c"], dtype="string")) + result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values + expected = Categorical( + pd.array(["c"] * 3, dtype="string"), + categories=pd.array(["a", "b", "c"], dtype="string"), ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values - expected = Categorical(pd.array(["c", pd.NA], dtype="string")) tm.assert_categorical_equal(result, expected) -def test_replace_maintain_ordering(): - # GH51016 - dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) - ser = pd.Series([0, 1, 2], dtype=dtype) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace(0, 2) - expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) - expected = pd.Series([2, 1, 2], dtype=expected_dtype) - tm.assert_series_equal(expected, result, check_category_order=True) +def test_replace_categorical_ea_dtype_different_cats_raises(): + # GH49404 + cat = Categorical(pd.array(["a", "b"], dtype="string")) + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + pd.Series(cat).replace(["a", "b"], ["c", pd.NA]) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index ef0315130215c..e2e5d47f50209 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -78,7 +78,7 @@ def test_print_none_width(self): assert exp == repr(a) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Change once infer_string is set to True by default", ) def test_unicode_print(self): diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index ba081bd01062a..768d3c1449fa4 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import FloatingArray @@ -122,6 +124,7 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py 
b/pandas/tests/arrays/integer/test_arithmetic.py index 8acd298f37a07..8aa8c2db940b4 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -172,6 +174,7 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index ef8701be81e2b..be87d5d3ef7ba 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -80,6 +82,7 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 5f73370554473..c719e19a7c8d1 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,12 +1,18 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] + pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 431309aca0df2..ff86b696c8403 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,5 +1,7 @@ import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -77,6 +79,7 @@ def test_arrow_array_missing(): assert result.storage.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_table_roundtrip(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -96,6 +99,7 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_arrow_load_from_zero_chunks(): # GH-41040 diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index 87eb7bcfa9cee..bd3298940ae3a 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -105,28 +105,36 @@ def test_accessor_raises(self): @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) @pytest.mark.parametrize("labels", [None, 
list(string.ascii_letters[:10])]) - @pytest.mark.parametrize("dtype", ["float64", "int64"]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_from_spmatrix(self, format, labels, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) + sp_dtype = SparseDtype(dtype) - mat = sp_sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) + sp_mat = sp_sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels) + mat = np.eye(10, dtype=dtype) expected = pd.DataFrame( - np.eye(10, dtype=dtype), index=labels, columns=labels + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + index=labels, + columns=labels, ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) - def test_from_spmatrix_including_explicit_zero(self, format): + @pytest.mark.parametrize("dtype", [np.int64, bool]) + def test_from_spmatrix_including_explicit_zero(self, format, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - mat = sp_sparse.random(10, 2, density=0.5, format=format) - mat.data[0] = 0 - result = pd.DataFrame.sparse.from_spmatrix(mat) - dtype = SparseDtype("float64", 0.0) - expected = pd.DataFrame(mat.todense()).astype(dtype) + sp_dtype = SparseDtype(dtype) + + sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype) + sp_mat.data[0] = 0 + result = pd.DataFrame.sparse.from_spmatrix(sp_mat) + mat = sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value) + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format): def test_from_spmatrix_columns(self, columns): sp_sparse = pytest.importorskip("scipy.sparse") - dtype = SparseDtype("float64", 0.0) + sp_dtype = SparseDtype(np.float64) - mat = sp_sparse.random(10, 2, density=0.5) - result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) - expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) + sp_mat = sp_sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns) + mat = sp_mat.toarray() + expected = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + ).astype(sp_dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] + "columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] ) - def test_to_coo(self, colnames): + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) + def test_to_coo(self, columns, dtype): sp_sparse = pytest.importorskip("scipy.sparse") - df = pd.DataFrame( - {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]" - ) - result = df.sparse.to_coo() - expected = sp_sparse.coo_matrix(np.asarray(df)) - assert (result != expected).nnz == 0 + sp_dtype = SparseDtype(dtype) - @pytest.mark.parametrize("fill_value", [1, np.nan]) - def test_to_coo_nonzero_fill_val_raises(self, fill_value): - pytest.importorskip("scipy") - df = pd.DataFrame( - { - "A": SparseArray( - [fill_value, fill_value, fill_value, 2], fill_value=fill_value - ), - "B": SparseArray( - [fill_value, 2, fill_value, fill_value], 
fill_value=fill_value - ), - } - ) - with pytest.raises(ValueError, match="fill value must be 0"): - df.sparse.to_coo() + expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype) + mat = expected.toarray() + result = pd.DataFrame( + np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value), + columns=columns, + dtype=sp_dtype, + ).sparse.to_coo() + assert (result != expected).nnz == 0 def test_to_coo_midx_categorical(self): # GH#50996 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 597b407a29c94..293f3c74223fd 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under12p0 from pandas.core.dtypes.common import is_dtype_equal @@ -20,17 +22,11 @@ ) -def na_val(dtype): - if dtype.storage == "pyarrow_numpy": - return np.nan - else: - return pd.NA - - @pytest.fixture -def dtype(string_storage): - """Fixture giving StringDtype from parametrized 'string_storage'""" - return pd.StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + """Fixture giving StringDtype from parametrized storage and na_value arguments""" + storage, na_value = string_dtype_arguments + return pd.StringDtype(storage=storage, na_value=na_value) @pytest.fixture @@ -39,24 +35,45 @@ def cls(dtype): return dtype.construct_array_type() +def test_dtype_equality(): + pytest.importorskip("pyarrow") + + dtype1 = pd.StringDtype("python") + dtype2 = pd.StringDtype("pyarrow") + dtype3 = pd.StringDtype("pyarrow", na_value=np.nan) + + assert dtype1 == pd.StringDtype("python", na_value=pd.NA) + assert dtype1 != dtype2 + assert dtype1 != dtype3 + + assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA) + assert dtype2 != dtype1 + assert dtype2 != dtype3 + + assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan) + assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan")) + assert dtype3 != dtype1 + assert dtype3 != dtype2 + + def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": + if dtype.storage == "pyarrow" and dtype.na_value is pd.NA: arr_name = "ArrowStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: @@ -68,7 +85,7 @@ def test_repr(dtype): def test_none_to_nan(cls, dtype): a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None - assert a[1] is na_val(a.dtype) + assert a[1] is a.dtype.na_value def test_setitem_validates(cls, dtype): @@ -150,8 +167,8 @@ def test_add(dtype): tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage: +def test_add_2d(dtype, request): + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = 
pytest.mark.xfail(raises=None, reason=reason) request.applymarker(mark) @@ -225,7 +242,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -244,7 +261,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -271,7 +288,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -293,7 +310,7 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, False]) else: @@ -387,7 +404,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: err = ValueError msg = "cannot convert float NaN to integer" else: @@ -441,13 +458,13 @@ def test_min_max(method, skipna, dtype): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is na_val(arr.dtype) + assert result is arr.dtype.na_value @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage and box is pd.array: +def test_min_max_numpy(method, box, dtype, request): + if dtype.storage == "pyarrow" and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -461,7 +478,7 @@ def test_min_max_numpy(method, box, dtype, request, arrow_string_storage): assert result == expected -def test_fillna_args(dtype, arrow_string_storage): +def test_fillna_args(dtype): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -474,7 +491,7 @@ def test_fillna_args(dtype, arrow_string_storage): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": msg = "Invalid value '1' for dtype string" else: msg = "Cannot set non-string value '1' into a StringArray." 
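These string tests pivot from the retired "pyarrow_numpy" storage string to an explicit `na_value` argument on `StringDtype`. A small sketch of the distinction as the tests above assert it on the development branch (requires pyarrow for the "pyarrow" storage):

    import numpy as np
    import pandas as pd

    dtype_na = pd.StringDtype("pyarrow")                    # missing values are pd.NA
    dtype_nan = pd.StringDtype("pyarrow", na_value=np.nan)  # NaN semantics instead

    assert dtype_na != dtype_nan  # same storage, different missing-value sentinel
    arr = pd.array(["a", None], dtype=dtype_nan)
    assert arr[1] is dtype_nan.na_value  # the dtype's own sentinel comes back out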
@@ -490,25 +507,19 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) assert arr.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): +def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -516,29 +527,21 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is na_val(result["a"].dtype) + assert result.loc[2, "a"] is result["a"].dtype.na_value +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks( - dtype, string_storage2, request, using_infer_string -): +def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -548,18 +551,18 @@ def test_arrow_load_from_zero_chunks( assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -573,10 +576,10 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): - if getattr(dtype, 
"storage", "") == "pyarrow": - exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = np.float64 + elif dtype.storage == "pyarrow": + exp_dtype = "double[pyarrow]" else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -586,10 +589,10 @@ def test_value_counts_with_normalize(dtype): def test_value_counts_sort_false(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) @@ -598,10 +601,10 @@ def test_value_counts_sort_false(dtype): tm.assert_series_equal(result, expected) -def test_memory_usage(dtype, arrow_string_storage): +def test_memory_usage(dtype): # GH 33963 - if dtype.storage in arrow_string_storage: + if dtype.storage == "pyarrow": pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) @@ -621,7 +624,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", na_val(dtype), "b"], dtype=object) + expected = np.array(["a", dtype.na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -661,7 +664,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is na_val(ser.dtype) + assert ser.array[1] is ser.dtype.na_value # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 405c1c217b04d..65c6ce8e9cd08 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -27,14 +27,18 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": - request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) + if using_infer_string and string_storage == "python": + # python string storage with na_value=NaN is not yet implemented + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype(string_storage) + dtype = StringDtype( + string_storage, na_value=np.nan if using_infer_string else pd.NA + ) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) @@ -46,18 +50,18 @@ def test_config_bad_storage_raises(): @pytest.mark.parametrize("chunked", [True, False]) -@pytest.mark.parametrize("array", ["numpy", "pyarrow"]) -def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): +@pytest.mark.parametrize("array_lib", ["numpy", "pyarrow"]) +def test_constructor_not_string_type_raises(array_lib, chunked): pa = pytest.importorskip("pyarrow") - array = pa if array in arrow_string_storage else np + array_lib = pa if array_lib == "pyarrow" else np - arr = array.array([1, 2, 3]) + arr = array_lib.array([1, 2, 3]) if chunked: - if array 
is np: + if array_lib is np: pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - if array is np: + if array_lib is np: msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( @@ -260,6 +264,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python' or 'pyarrow'." with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 857509e18fa8e..76b8928f28b65 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -1,9 +1,11 @@ import datetime import decimal +import zoneinfo import numpy as np import pytest -import pytz + +from pandas._config import using_string_dtype import pandas as pd import pandas._testing as tm @@ -125,7 +127,7 @@ def test_dt64_array(dtype_unit): ( pd.DatetimeIndex(["2000", "2001"]), None, - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( ["2000", "2001"], @@ -285,9 +287,7 @@ def test_array_copy(): assert tm.shares_memory(a, b) -cet = pytz.timezone("CET") - - +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ @@ -301,11 +301,11 @@ def test_array_copy(): # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[us]"), ), ( np.array([1, 2], dtype="M8[ns]"), @@ -321,16 +321,23 @@ def test_array_copy(): ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="s") ), ), ( [ - datetime.datetime(2000, 1, 1, tzinfo=cet), - datetime.datetime(2001, 1, 1, tzinfo=cet), + datetime.datetime( + 2000, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin") + ), + datetime.datetime( + 2001, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin") + ), ], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns") + ["2000", "2001"], + dtype=pd.DatetimeTZDtype( + tz=zoneinfo.ZoneInfo("Europe/Berlin"), unit="us" + ), ), ), # timedelta diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8650be62ae7eb..8e348805de978 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -7,12 +7,6 @@ from datetime import timedelta import operator -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - import numpy as np import pytest @@ -724,21 +718,14 @@ def test_tz_localize_t2d(self): roundtrip = expected.tz_localize("US/Pacific") tm.assert_datetime_array_equal(roundtrip, dta) - easts = ["US/Eastern", "dateutil/US/Eastern"] - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Eastern") - except KeyError: - # no tzdata - pass - else: - # Argument 1 to "append" of "list" has incompatible type "ZoneInfo"; - # expected "str" - easts.append(tz) # type: 
ignore[arg-type] - - @pytest.mark.parametrize("tz", easts) + @pytest.mark.parametrize( + "tz", ["US/Eastern", "dateutil/US/Eastern", "pytz/US/Eastern"] + ) def test_iter_zoneinfo_fold(self, tz): # GH#49684 + if tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) utc_vals = np.array( [1320552000, 1320555600, 1320559200, 1320562800], dtype=np.int64 ) @@ -764,31 +751,16 @@ def test_iter_zoneinfo_fold(self, tz): assert left.utcoffset() == right2.utcoffset() @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2SME", "2SM"), - ("2SME", "2sm"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ("2ME", "2m"), - ("2QE-SEP", "2q-sep"), - ("2YE", "2y"), - ], + "freq", + ["2M", "2SM", "2sm", "2Q", "2Q-SEP", "1Y", "2Y-MAR", "2m", "2q-sep", "2y"], ) - def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): - # GH#9586, GH#54275 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." + def test_date_range_frequency_M_Q_Y_raises(self, freq): + msg = f"Invalid frequency: {freq}" - expected = pd.date_range("1/1/2000", periods=4, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) - @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"]) + @pytest.mark.parametrize("freq_depr", ["2MIN", "2nS", "2Us"]) def test_date_range_uppercase_frequency_deprecated(self, freq_depr): # GH#9586, GH#54939 depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " @@ -800,7 +772,7 @@ def test_date_range_uppercase_frequency_deprecated(self, freq_depr): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "freq_depr", + "freq", [ "2ye-mar", "2ys", @@ -811,17 +783,21 @@ def test_date_range_uppercase_frequency_deprecated(self, freq_depr): "2bms", "2cbme", "2me", - "2w", ], ) - def test_date_range_lowercase_frequency_deprecated(self, freq_depr): + def test_date_range_lowercase_frequency_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) + + def test_date_range_lowercase_frequency_deprecated(self): # GH#9586, GH#54939 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version, please use '{freq_depr.upper()[1:]}' instead." 
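Per the rewritten tests above, most long-deprecated lowercase/uppercase offset aliases now raise instead of warning. A sketch of the behavior those tests pin down (pandas 3.0 development semantics; "2ME" is the canonical month-end spelling):

    import pandas as pd

    # Canonical aliases still work: month-end frequency.
    idx = pd.date_range("1/1/2000", periods=4, freq="2ME")

    # Retired spellings such as "2m" or "2H" now raise ValueError.
    try:
        pd.date_range("1/1/2000", periods=4, freq="2m")
    except ValueError as err:
        assert "Invalid frequency" in str(err)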
+ depr_msg = "'w' is deprecated and will be removed in a future version" - expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.upper()) + expected = pd.date_range("1/1/2000", periods=4, freq="2W") with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + result = pd.date_range("1/1/2000", periods=4, freq="2w") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("freq", ["1A", "2A-MAR", "2a-mar"]) @@ -831,6 +807,13 @@ def test_date_range_frequency_A_raises(self, freq): with pytest.raises(ValueError, match=msg): pd.date_range("1/1/2000", periods=4, freq=freq) + @pytest.mark.parametrize("freq", ["2H", "2CBH", "2S"]) + def test_date_range_uppercase_frequency_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + pd.date_range("1/1/2000", periods=4, freq=freq) + def test_factorize_sort_without_freq(): dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]") diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index f3ac60f672ee1..c4b02423f8cf0 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -146,10 +146,12 @@ def test_constructor_datetime_outofbound( # No dtype specified (dtype inference) # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data - if a.dtype.kind == "M": + result = constructor(a) + if a.dtype.kind == "M" or isinstance(a[0], np.datetime64): # Can't fit in nanosecond bounds -> get the nearest supported unit - result = constructor(a) assert result.dtype == "M8[s]" + elif isinstance(a[0], datetime): + assert result.dtype == "M8[us]", result.dtype else: result = constructor(a) if using_infer_string and "object-string" in request.node.callspec.id: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 6c0df49b0a93a..dd6bf3c7521f8 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -412,7 +412,7 @@ def test_to_numpy_dtype(as_series): [Timestamp("2000"), Timestamp("2000"), pd.NaT], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) @@ -454,7 +454,7 @@ def test_to_numpy_na_value_numpy_dtype( [(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index f6a4396ca5be0..bbd9b150b88a8 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import PYPY @@ -82,7 +82,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif( - PYPY or using_pyarrow_string_dtype(), + PYPY or using_string_dtype(), reason="not relevant for PyPy doesn't work properly for arrow strings", ) def test_memory_usage(index_or_series_memory_obj): diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 3a8ed471f9dc0..7f094db6ea524 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import 
using_pyarrow_string_dtype - import pandas as pd import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -100,12 +98,11 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu -@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji - obj = index_or_series([uval] * 2) + obj = index_or_series([uval] * 2, dtype=object) result = obj.unique() if isinstance(obj, pd.Index): diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index d52f33fe80434..31d568d7c1e0c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -758,16 +758,25 @@ class TestTypeCasting: # maybe someday... numexpr has too many upcasting rules now # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float'])) @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")]) - def test_binop_typecasting(self, engine, parser, op, float_numpy_dtype, left_right): - df = DataFrame( - np.random.default_rng(2).standard_normal((5, 3)), dtype=float_numpy_dtype - ) + def test_binop_typecasting( + self, engine, parser, op, complex_or_float_dtype, left_right, request + ): + # GH#21374 + dtype = complex_or_float_dtype + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dtype) left, right = left_right s = f"{left} {op} {right}" res = pd.eval(s, engine=engine, parser=parser) - assert df.values.dtype == float_numpy_dtype - assert res.values.dtype == float_numpy_dtype - tm.assert_frame_equal(res, eval(s)) + if dtype == "complex64" and engine == "numexpr": + mark = pytest.mark.xfail( + reason="numexpr issue with complex that are upcast " + "to complex 128 " + "https://github.com/pydata/numexpr/issues/492" + ) + request.applymarker(mark) + assert df.values.dtype == dtype + assert res.values.dtype == dtype + tm.assert_frame_equal(res, eval(s), check_exact=False) # ------------------------------------- @@ -1791,7 +1800,7 @@ def test_numexpr_option_incompatible_op(): {"A": [True, False, True, False, None, None], "B": [1, 2, 3, 4, 5, 6]} ) result = df.query("A.isnull()") - expected = DataFrame({"A": [None, None], "B": [5, 6]}, index=[4, 5]) + expected = DataFrame({"A": [None, None], "B": [5, 6]}, index=range(4, 6)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index d1e4104e16465..8724f62de1534 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -195,6 +197,7 @@ def test_astype_arrow_timestamp(): assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -210,6 +213,7 @@ def test_convert_dtypes_infer_objects(): tm.assert_series_equal(ser, ser_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes(): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 
bc931b53b37d0..743e094032505 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -207,6 +209,7 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): assert np.shares_memory(arr_before, arr_after) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] ) @@ -228,12 +231,12 @@ def test_dataframe_from_series_or_index_different_dtype(index_or_series): assert df._mgr._has_no_reference(0) -def test_dataframe_from_series_infer_datetime(): +def test_dataframe_from_series_dont_infer_datetime(): ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - df = DataFrame(ser) - assert not np.shares_memory(get_array(ser), get_array(df, 0)) - assert df._mgr._has_no_reference(0) + df = DataFrame(ser) + assert df.dtypes.iloc[0] == np.dtype(object) + assert np.shares_memory(get_array(ser), get_array(df, 0)) + assert not df._mgr._has_no_reference(0) @pytest.mark.parametrize("index", [None, [0, 1, 2]]) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 196d908a44a46..d2e2d43b0a42b 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -12,6 +14,7 @@ from pandas.tests.copy_view.util import get_array +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_frames(): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -30,6 +33,7 @@ def test_concat_frames(): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_frames_updating_input(): df = DataFrame({"b": ["a"] * 3}) df2 = DataFrame({"a": ["a"] * 3}) @@ -149,6 +153,7 @@ def test_concat_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ @@ -200,6 +205,7 @@ def test_merge_on_index(): tm.assert_frame_equal(df2, df2_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "func, how", [ @@ -243,6 +249,7 @@ def test_merge_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_on_key(): df_index = Index(["a", "b", "c"], name="key") @@ -270,6 +277,7 @@ def test_join_on_key(): tm.assert_frame_equal(df2, df2_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_join_multiple_dataframes_on_key(): df_index = Index(["a", "b", "c"], name="key") diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index b10141b0d63f4..37a21e1098e78 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -725,15 +725,13 @@ def test_column_as_series_set_with_upcast(backend): with pytest.raises(TypeError, match="Invalid value"): s[0] = "foo" expected = Series([1, 2, 3], name="a") + 
tm.assert_series_equal(s, expected) + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) else: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): s[0] = "foo" - expected = Series(["foo", 2, 3], dtype=object, name="a") - - tm.assert_series_equal(s, expected) - tm.assert_frame_equal(df, df_orig) - # ensure cached series on getitem is not the changed series - tm.assert_series_equal(df["a"], df_orig["a"]) @pytest.mark.parametrize( @@ -805,16 +803,14 @@ def test_set_value_copy_only_necessary_column(indexer_func, indexer, val, col): view = df[:] if val == "a": - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype is deprecated" - ): + with pytest.raises(TypeError, match="Invalid value"): indexer_func(df)[indexer] = val + else: + indexer_func(df)[indexer] = val - indexer_func(df)[indexer] = val - - assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) - assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) - tm.assert_frame_equal(view, df_orig) + assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + tm.assert_frame_equal(view, df_orig) def test_series_midx_slice(): diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index a4cb1e6bea9c9..b2a26ceacd6c3 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import DataFrame import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -40,6 +42,7 @@ def test_consolidate(): assert df.loc[0, "b"] == 0.1 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index abd87162ec32e..f80e9b7dcf838 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( NA, DataFrame, @@ -110,6 +112,7 @@ def test_interp_fill_functions_inplace(func, dtype): assert view._mgr._has_no_reference(0) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_cannot_with_object_dtype(): df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) @@ -118,6 +121,7 @@ def test_interpolate_cannot_with_object_dtype(): df.interpolate() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_interpolate_object_convert_no_op(): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) arr_a = get_array(df, "a") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 3712a74fe54ed..3716df8fbf855 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -712,6 +714,7 @@ def test_head_tail(method): tm.assert_frame_equal(df, df_orig) +@pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") def test_infer_objects(): df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) df_orig = df.copy() @@ -896,6 +899,7 @@ def test_sort_values_inplace(obj, kwargs): tm.assert_equal(view, obj_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(decimals): df = DataFrame({"a": [1, 2], "b": "c"}) @@ -1105,26 +1109,26 @@ def test_putmask_aligns_rhs_no_reference(dtype): assert np.shares_memory(arr_a, get_array(df, "a")) -@pytest.mark.parametrize( - "val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)] -) -def test_putmask_dont_copy_some_blocks(val, exp, warn): +@pytest.mark.parametrize("val, exp, raises", [(5.5, True, True), (5, False, False)]) +def test_putmask_dont_copy_some_blocks(val, exp, raises: bool): df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5}) view = df[:] df_orig = df.copy() indexer = DataFrame( [[True, False, False], [True, False, False]], columns=list("abc") ) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + df[indexer] = val + else: df[indexer] = val - - assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) - # TODO(CoW): Could split blocks to avoid copying the whole block - assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp - assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) - assert df._mgr._has_no_reference(1) is not exp - assert not df._mgr._has_no_reference(2) - tm.assert_frame_equal(view, df_orig) + assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) + # TODO(CoW): Could split blocks to avoid copying the whole block + assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp + assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) + assert df._mgr._has_no_reference(1) is not exp + assert not df._mgr._has_no_reference(2) + tm.assert_frame_equal(view, df_orig) @pytest.mark.parametrize("dtype", ["int64", "Int64"]) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 63254f1244a2e..c1120ccfea635 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -9,6 +11,7 @@ from pandas.tests.copy_view.util import get_array +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "replace_kwargs", [ @@ -56,6 +59,7 @@ def test_replace_regex_inplace_refs(): tm.assert_frame_equal(view, df_orig) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_regex_inplace(): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") @@ -129,18 +133,14 @@ def test_replace_to_replace_wrong_dtype(): def test_replace_list_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) + + df.replace(["c"], value="a", inplace=True) assert np.shares_memory(arr.codes, get_array(df, "a").codes) assert df._mgr._has_no_reference(0) df_orig = df.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): - df2 = df.replace(["b"], value="a") + 
df.replace(["b"], value="a") + df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"})) assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) @@ -150,13 +150,7 @@ def test_replace_list_inplace_refs_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) - assert not np.shares_memory(get_array(view, "a").codes, get_array(df, "a").codes) + df.replace(["c"], value="a", inplace=True) tm.assert_frame_equal(df_orig, view) @@ -195,56 +189,34 @@ def test_replace_inplace_reference_no_op(to_replace): @pytest.mark.parametrize("to_replace", [1, [1]]) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace_reference(val, to_replace): +def test_replace_categorical_inplace_reference(to_replace): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df.replace(to_replace=to_replace, value=val, inplace=True) - + df.replace(to_replace=to_replace, value=1, inplace=True) assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) tm.assert_frame_equal(view, df_orig) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace(val): +def test_replace_categorical_inplace(): df = DataFrame({"a": Categorical([1, 2, 3])}) arr_a = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df.replace(to_replace=1, value=val, inplace=True) + df.replace(to_replace=1, value=1, inplace=True) assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) assert df._mgr._has_no_reference(0) - expected = DataFrame({"a": Categorical([val, 2, 3])}) + expected = DataFrame({"a": Categorical([1, 2, 3])}) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical(val): +def test_replace_categorical(): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df2 = df.replace(to_replace=1, value=val) + df2 = df.replace(to_replace=1, value=1) assert df._mgr._has_no_reference(0) assert df2._mgr._has_no_reference(0) @@ -285,6 +257,7 @@ def test_replace_empty_list(): assert not df2._mgr._has_no_reference(0) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(value): df = DataFrame({"a": ["a", "b", "c"]}) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index c6da01636247d..b6c5becf49fa0 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -3,7 +3,8 @@ import numpy as np import pytest -import pytz + +from pandas._config import using_string_dtype from 
pandas._libs.tslibs.dtypes import NpyDatetimeUnit @@ -391,8 +392,9 @@ def test_empty(self): def test_tz_standardize(self): # GH 24713 + pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") - dr = date_range("2013-01-01", periods=3, tz="US/Eastern") + dr = date_range("2013-01-01", periods=3, tz=tz) dtype = DatetimeTZDtype("ns", dr.tz) assert dtype.tz == tz dtype = DatetimeTZDtype("ns", dr[0].tz) @@ -959,6 +961,7 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(["b", "a"], ordered=True) assert c1 is not c2 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("ordered2", [True, False, None]) def test_categorical_equality(self, ordered, ordered2): # same categories, same order @@ -1231,3 +1234,15 @@ def test_multi_column_dtype_assignment(): df["b"] = 0 tm.assert_frame_equal(df, expected) + + +def test_loc_setitem_empty_labels_no_dtype_conversion(): + # GH 29707 + + df = pd.DataFrame({"a": [2, 3]}) + expected = df.copy() + assert df.a.dtype == "int64" + df.loc[[]] = 0.1 + + assert df.a.dtype == "int64" + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f4282c9c7ac3a..b1d7c701e1267 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -12,6 +12,7 @@ datetime, time, timedelta, + timezone, ) from decimal import Decimal from fractions import Fraction @@ -27,7 +28,6 @@ import numpy as np import pytest -import pytz from pandas._libs import ( lib, @@ -830,7 +830,11 @@ def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): out = lib.maybe_convert_objects(arr, convert_non_numeric=True) # no OutOfBoundsDatetime/OutOfBoundsTimedeltas - tm.assert_numpy_array_equal(out, arr) + if dtype == "datetime64[ns]": + expected = np.array(["2363-10-04"], dtype="M8[us]") + else: + expected = arr + tm.assert_numpy_array_equal(out, expected) def test_maybe_convert_objects_mixed_datetimes(self): ts = Timestamp("now") @@ -1018,7 +1022,7 @@ def test_maybe_convert_objects_itemsize(self, data0, data1): def test_mixed_dtypes_remain_object_array(self): # GH14956 - arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) + arr = np.array([datetime(2015, 1, 1, tzinfo=timezone.utc), 1], dtype=object) result = lib.maybe_convert_objects(arr, convert_non_numeric=True) tm.assert_numpy_array_equal(result, arr) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 2109c794ad44f..f86ed6f49759f 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -697,6 +697,9 @@ def test_array_equivalent_index_with_tuples(): ("f2", np.nan), ("f4", np.nan), ("f8", np.nan), + # Complex + ("c8", np.nan), + ("c16", np.nan), # Object ("O", np.nan), # Interval diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 2bfe801c48a77..e924e38ee5030 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -30,8 +30,9 @@ def test_astype_object_frame(self, all_data): blk = result._mgr.blocks[0] assert isinstance(blk, NumpyBlock), type(blk) assert blk.is_object - assert isinstance(result._mgr.arrays[0], np.ndarray) - assert result._mgr.arrays[0].dtype == np.dtype(object) + arr = result._mgr.blocks[0].values + assert isinstance(arr, np.ndarray) + assert arr.dtype == np.dtype(object) # check that we can compare the dtypes comp = result.dtypes == df.dtypes 
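This casting hunk and the constructors hunk that follows make the same internals migration: assertions stop going through the removed _mgr.arrays accessor and read the block values directly. A minimal sketch of the pattern the tests now use (note that _mgr is private pandas API, shown here only to mirror the assertions, not as a public contract):

```python
import numpy as np
import pandas as pd

# Mirrors the updated assertions: after astype(object), the backing
# array is reached via _mgr.blocks[0].values instead of _mgr.arrays[0].
df = pd.DataFrame({"a": [1.5, 2.5]})
result = df.astype(object)
arr = result._mgr.blocks[0].values
assert isinstance(arr, np.ndarray)
assert arr.dtype == np.dtype(object)
```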
diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index c32a6a6a115ac..639dc874c9fb9 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -69,7 +69,7 @@ def test_dataframe_constructor_from_dict(self, data, from_series): assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) def test_dataframe_from_series(self, data): result = pd.DataFrame(pd.Series(data)) @@ -77,7 +77,7 @@ def test_dataframe_from_series(self, data): assert result.shape == (len(data), 1) if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) def test_series_given_mismatched_index_raises(self, data): msg = r"Length of values \(3\) does not match length of index \(5\)" diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 935edce32a0ab..27fa1206f6f7f 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -408,7 +408,7 @@ def test_take_series(self, data): result = s.take([0, -1]) expected = pd.Series( data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), - index=[0, len(data) - 1], + index=range(0, 198, 99), ) tm.assert_series_equal(result, expected) @@ -428,7 +428,8 @@ def test_reindex(self, data, na_value): result = s.reindex([n, n + 1]) expected = pd.Series( - data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1] + data._from_sequence([na_value, na_value], dtype=s.dtype), + index=range(n, n + 2, 1), ) tm.assert_series_equal(result, expected) @@ -450,7 +451,7 @@ def test_loc_len1(self, data): df = pd.DataFrame({"A": data}) res = df.loc[[0], "A"] assert res.ndim == 1 - assert res._mgr.arrays[0].ndim == 1 + assert res._mgr.blocks[0].ndim == 1 if hasattr(res._mgr, "blocks"): assert res._mgr._block.ndim == 1 diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b951d4c35d208..dd2ed0bd62a02 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data): expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) - if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( + if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan: + # TODO: avoid special-casing + expected = expected.astype("float64") + elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( data.dtype, pd.ArrowDtype ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") - elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": - # TODO: avoid special-casing - expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") @@ -116,10 +116,8 @@ def test_argsort_missing_array(self, data_missing_for_sorting): tm.assert_numpy_array_equal(result, expected) def test_argsort_missing(self, data_missing_for_sorting): - msg = "The behavior of Series.argsort in the presence of NA values" - with tm.assert_produces_warning(FutureWarning, 
match=msg): - result = pd.Series(data_missing_for_sorting).argsort() - expected = pd.Series(np.array([1, -1, 0], dtype=np.intp)) + result = pd.Series(data_missing_for_sorting).argsort() + expected = pd.Series(np.array([2, 0, 1], dtype=np.intp)) tm.assert_series_equal(result, expected) def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 5cd66d8a874c7..fad2560265d21 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.core.dtypes.common import is_string_dtype @@ -37,7 +37,7 @@ def _get_expected_exception( else: result = self.frame_scalar_exc - if using_pyarrow_string_dtype() and result is not None: + if using_string_dtype() and result is not None: import pyarrow as pa result = ( # type: ignore[assignment] diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index c3a6daee2dd54..4b3431d938f96 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -4,7 +4,6 @@ import pandas as pd import pandas._testing as tm -from pandas.api.types import is_numeric_dtype class BaseReduceTests: @@ -57,7 +56,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): arr = ser.array df = pd.DataFrame({"a": arr}) - kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} + kwargs = {"ddof": 1} if op_name in ["var", "std", "sem"] else {} cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna) @@ -119,10 +118,8 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): def test_reduce_frame(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) - if not is_numeric_dtype(ser.dtype): - pytest.skip(f"{ser.dtype} is not numeric dtype") - if op_name in ["count", "kurt", "sem"]: + if op_name == "count": pytest.skip(f"{op_name} not an array method") if not self._supports_reduction(ser, op_name): diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 489cd15644d04..2915c0585f373 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -29,12 +29,12 @@ def test_concat(self, data, in_frame): assert dtype == data.dtype if hasattr(result._mgr, "blocks"): assert isinstance(result._mgr.blocks[0], EABackedBlock) - assert isinstance(result._mgr.arrays[0], ExtensionArray) + assert isinstance(result._mgr.blocks[0].values, ExtensionArray) @pytest.mark.parametrize("in_frame", [True, False]) def test_concat_all_na_block(self, data_missing, in_frame): - valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1]) - na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3]) + valid_block = pd.Series(data_missing.take([1, 1]), index=range(2)) + na_block = pd.Series(data_missing.take([0, 0]), index=range(2, 4)) if in_frame: valid_block = pd.DataFrame({"a": valid_block}) na_block = pd.DataFrame({"a": na_block}) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index a455b21b9932a..1d613ced2c03f 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -374,7 +374,7 @@ def test_setitem_preserves_views(self, data): def 
test_setitem_with_expansion_dataframe_column(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 - df = expected = pd.DataFrame({0: pd.Series(data)}) + df = expected = pd.DataFrame(pd.Series(data)) result = pd.DataFrame(index=df.index) key = full_indexer(df) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 6f18761f77138..070feb1fec4b9 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -72,6 +72,8 @@ def _get_expected_exception( return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["kurt", "sem"]: + return False return True def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..dbf353d87178f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -32,6 +32,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas._libs.tslibs import timezones from pandas.compat import ( @@ -67,7 +69,10 @@ pa = pytest.importorskip("pyarrow") -from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.arrow.array import ( + ArrowExtensionArray, + get_unit_from_pa_dtype, +) from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -464,17 +469,14 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques self.check_accumulate(ser, op_name, skipna) def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["kurt", "skew"]: + return False + dtype = ser.dtype # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has # no attribute "pyarrow_dtype" pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr] - if pa.types.is_temporal(pa_dtype) and op_name in [ - "sum", - "var", - "skew", - "kurt", - "prod", - ]: + if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod"]: if pa.types.is_duration(pa_dtype) and op_name in ["sum"]: # summing timedeltas is one case that *is* well-defined pass @@ -490,8 +492,6 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: "std", "sem", "var", - "skew", - "kurt", ]: return False @@ -505,6 +505,16 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: # behavior which does not support this. 
return False + if pa.types.is_boolean(pa_dtype) and op_name in [ + "median", + "std", + "var", + "skew", + "kurt", + "sem", + ]: + return False + return True def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): @@ -528,32 +538,6 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = getattr(alt, op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): - dtype = data.dtype - pa_dtype = dtype.pyarrow_dtype - - xfail_mark = pytest.mark.xfail( - raises=TypeError, - reason=( - f"{all_numeric_reductions} is not implemented in " - f"pyarrow={pa.__version__} for {pa_dtype}" - ), - ) - if all_numeric_reductions in {"skew", "kurt"} and ( - dtype._is_numeric or dtype.kind == "b" - ): - request.applymarker(xfail_mark) - - elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { - "sem", - "std", - "var", - "median", - }: - request.applymarker(xfail_mark) - super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) - @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean( self, data, all_boolean_reductions, skipna, na_value, request @@ -574,15 +558,32 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + pa_type = arr._pa_array.type + if op_name in ["max", "min"]: cmp_dtype = arr.dtype + elif pa.types.is_temporal(pa_type): + if op_name in ["std", "sem"]: + if pa.types.is_duration(pa_type): + cmp_dtype = arr.dtype + elif pa.types.is_date(pa_type): + cmp_dtype = ArrowDtype(pa.duration("s")) + elif pa.types.is_time(pa_type): + unit = get_unit_from_pa_dtype(pa_type) + cmp_dtype = ArrowDtype(pa.duration(unit)) + else: + cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) + else: + cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": - if op_name not in ["median", "var", "std"]: + if op_name not in ["median", "var", "std", "sem"]: cmp_dtype = arr.dtype else: cmp_dtype = "float64[pyarrow]" - elif op_name in ["median", "var", "std", "mean", "skew"]: + elif op_name in ["median", "var", "std", "mean", "skew", "sem"]: cmp_dtype = "float64[pyarrow]" + elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type): + cmp_dtype = "uint64[pyarrow]" else: cmp_dtype = { "i": "int64[pyarrow]", @@ -598,6 +599,14 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): if data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") request.applymarker(mark) + elif ( + op_name in ["std", "sem"] + and pa.types.is_date64(data._pa_array.type) + and skipna + ): + # overflow + mark = pytest.mark.xfail(reason="Cannot cast") + request.applymarker(mark) return super().test_reduce_frame(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"]) @@ -1986,6 +1995,7 @@ def test_str_find_large_start(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311" ) @@ -2437,13 +2447,13 @@ def test_unsupported_dt(data): ["hour", 3], ["minute", 4], ["is_leap_year", False], - ["microsecond", 5], + ["microsecond", 2000], ["month", 1], ["nanosecond", 6], ["quarter", 1], ["second", 7], ["date", date(2023, 1, 
2)], - ["time", time(3, 4, 7, 5)], + ["time", time(3, 4, 7, 2000)], ], ) def test_dt_properties(prop, expected): @@ -2456,7 +2466,7 @@ def test_dt_properties(prop, expected): hour=3, minute=4, second=7, - microsecond=5, + microsecond=2000, nanosecond=6, ), None, @@ -2473,6 +2483,28 @@ def test_dt_properties(prop, expected): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("microsecond", [2000, 5, 0]) +def test_dt_microsecond(microsecond): + # GH 59183 + ser = pd.Series( + [ + pd.Timestamp( + year=2024, + month=7, + day=7, + second=5, + microsecond=microsecond, + nanosecond=6, + ), + None, + ], + dtype=ArrowDtype(pa.timestamp("ns")), + ) + result = ser.dt.microsecond + expected = pd.Series([microsecond, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + def test_dt_is_month_start_end(): ser = pd.Series( [ @@ -2905,6 +2937,31 @@ def test_dt_components(): tm.assert_frame_equal(result, expected) +def test_dt_components_large_values(): + ser = pd.Series( + [ + pd.Timedelta("365 days 23:59:59.999000"), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 @@ -3445,7 +3502,7 @@ def test_arrow_floor_division_large_divisor(dtype): def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] - result = pd.Series(string_dates, dtype="timestamp[ns][pyarrow]") + result = pd.Series(string_dates, dtype="timestamp[s][pyarrow]") expected = pd.Series( ArrowExtensionArray(pa.array(pd.to_datetime(string_dates), from_pandas=True)) ) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 09662f7b793a9..8f8af607585df 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -19,7 +19,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import Categorical @@ -99,7 +99,7 @@ def test_contains(self, data, data_missing): continue assert na_value_obj not in data # this section suffers from super method - if not using_pyarrow_string_dtype(): + if not using_string_dtype(): assert na_value_obj in data_missing def test_empty(self, dtype): diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index a42fa6088d9c8..356d5352f41f4 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -95,6 +95,11 @@ def _get_expected_exception(self, op_name, obj, other): return None return super()._get_expected_exception(op_name, obj, other) + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): + if op_name == "std": + return "timedelta64[ns]" + return arr.dtype + def _supports_accumulation(self, ser, op_name: str) -> bool: return op_name in ["cummin", "cummax"] diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 69ce42203d510..3b9079d06e231 100644 --- a/pandas/tests/extension/test_masked.py +++ 
b/pandas/tests/extension/test_masked.py @@ -301,7 +301,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name - elif op_name in ["mean", "median", "var", "std", "skew"]: + elif op_name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]: cmp_dtype = "Float64" elif op_name in ["max", "min"]: cmp_dtype = arr.dtype.name @@ -323,9 +323,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): else "UInt64" ) elif arr.dtype.kind == "b": - if op_name in ["mean", "median", "var", "std", "skew"]: - cmp_dtype = "Float64" - elif op_name in ["min", "max"]: + if op_name in ["min", "max"]: cmp_dtype = "boolean" elif op_name in ["sum", "prod"]: cmp_dtype = ( diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 49ad3fce92a5c..2ab248787a1cf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -22,6 +22,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -29,6 +31,10 @@ from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def maybe_split_array(arr, chunked): if not chunked: @@ -53,8 +59,9 @@ def chunked(request): @pytest.fixture -def dtype(string_storage): - return StringDtype(storage=string_storage) +def dtype(string_dtype_arguments): + storage, na_value = string_dtype_arguments + return StringDtype(storage=storage, na_value=na_value) @pytest.fixture @@ -96,16 +103,22 @@ def data_for_grouping(dtype, chunked): class TestStringArray(base.ExtensionTests): def test_eq_with_str(self, dtype): - assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) + if dtype.na_value is pd.NA: + # only the NA-variant supports parametrized string alias + assert dtype == f"string[{dtype.storage}]" + elif dtype.storage == "pyarrow": + # TODO(infer_string) deprecate this + assert dtype == "string[pyarrow_numpy]" + def test_is_not_string_type(self, dtype): # Different from BaseDtypeTests.test_is_not_string_type # because StringDtype is a string type assert is_string_dtype(dtype) - def test_view(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_view(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -113,13 +126,13 @@ def test_from_dtype(self, data): # base test uses string representation of dtype pass - def test_transpose(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_transpose(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) - def test_setitem_preserves_views(self, data, request, arrow_string_storage): - if data.dtype.storage in arrow_string_storage: + def test_setitem_preserves_views(self, data): + if data.dtype.storage == "pyarrow": pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) @@ -140,28 +153,21 @@ def _get_expected_exception( self, op_name: str, obj, other ) -> type[Exception] | None: if op_name in 
["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) and cast( - StringDtype, tm.get_dtype(obj) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if ( + isinstance(obj, pd.Series) + and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError - elif isinstance(other, pd.Series) and cast( - StringDtype, tm.get_dtype(other) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + elif ( + isinstance(other, pd.Series) + and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError return TypeError elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": return NotImplementedError return TypeError elif op_name in ["__mul__", "__rmul__"]: @@ -175,10 +181,7 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": import pyarrow as pa # TODO: better to re-raise as TypeError? @@ -190,7 +193,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) @@ -198,10 +201,10 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: cast_to = dtype + elif dtype.na_value is np.nan: + cast_to = np.bool_ # type: ignore[assignment] elif dtype.storage == "pyarrow": cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": - cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 60a8e688b3b8a..fc7c03dc25839 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( DataFrame, @@ -44,9 +44,7 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif( - using_pyarrow_string_dtype(), reason="columns inferring logic broken" - ) + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), @@ -110,6 +108,7 @@ def test_constructor_list_of_series(self): expected = DataFrame.from_dict(sdict, orient="index") tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken") def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series recons = DataFrame.from_dict(data_dict, orient="index") diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 66fc234e79b4d..abc3aab1c1492 100644 --- 
a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -1,12 +1,14 @@ from collections.abc import Iterator -from datetime import datetime +from datetime import ( + datetime, + timezone, +) from decimal import Decimal import numpy as np import pytest -import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import is_platform_little_endian @@ -39,7 +41,7 @@ def test_from_records_with_datetimes(self): expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] - dtypes = [("EXPIRY", " None: tm.assert_frame_equal(df, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 pytest.importorskip("pyarrow") @@ -1925,23 +1893,30 @@ def test_add_new_column_infer_string(): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py - # but checks for warnings instead of errors. - def _check_setitem_invalid(self, df, invalid, indexer, warn): - msg = "Setting an item of incompatible dtype is deprecated" - msg = re.escape(msg) - + def _check_setitem_invalid(self, df, invalid, indexer): orig_df = df.copy() # iloc - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[indexer, 0] = invalid df = orig_df.copy() # loc - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): df.loc[indexer, "a"] = invalid df = orig_df.copy() + def _check_setitem_valid(self, df, value, indexer): + orig_df = df.copy() + + # iloc + df.iloc[indexer, 0] = value + df = orig_df.copy() + + # loc + df.loc[indexer, "a"] = value + df = orig_df.copy() + _invalid_scalars = [ 1 + 2j, "True", @@ -1959,20 +1934,19 @@ def _check_setitem_invalid(self, df, invalid, indexer, warn): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): df = DataFrame({"a": [True, False, False]}, dtype="bool") - self._check_setitem_invalid(df, invalid, indexer, FutureWarning) + self._check_setitem_invalid(df, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid): - warn = None + self._check_setitem_valid(df, invalid, indexer) else: - warn = FutureWarning - self._check_setitem_invalid(df, invalid, indexer, warn) + self._check_setitem_invalid(df, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer, FutureWarning) + self._check_setitem_invalid(df, invalid, indexer) diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index b530cb98ef46c..3dd8f7196c594 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors 
import PerformanceWarning from pandas import ( @@ -61,6 +63,7 @@ def test_insert_column_bug_4032(self): expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_insert_with_columns_dups(self): # GH#14291 df = DataFrame() diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py index 264e27c9c122e..ac6f0a1ac0f73 100644 --- a/pandas/tests/frame/indexing/test_mask.py +++ b/pandas/tests/frame/indexing/test_mask.py @@ -122,7 +122,7 @@ def test_mask_stringdtype(frame_or_series): def test_mask_where_dtype_timedelta(): # https://github.com/pandas-dev/pandas/issues/39548 - df = DataFrame([Timedelta(i, unit="d") for i in range(5)]) + df = DataFrame([Timedelta(i, unit="D") for i in range(5)]) expected = DataFrame(np.full(5, np.nan, dtype="timedelta64[ns]")) tm.assert_frame_equal(df.mask(df.notna()), expected) @@ -130,7 +130,7 @@ def test_mask_where_dtype_timedelta(): expected = DataFrame( [np.nan, np.nan, np.nan, Timedelta("3 day"), Timedelta("4 day")] ) - tm.assert_frame_equal(df.where(df > Timedelta(2, unit="d")), expected) + tm.assert_frame_equal(df.where(df > Timedelta(2, unit="D")), expected) def test_mask_return_dtype(): diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index ce771280bc264..aaf95daf232e2 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas.core.dtypes.common import is_float_dtype @@ -6,7 +7,6 @@ DataFrame, isna, ) -import pandas._testing as tm class TestSetValue: @@ -40,11 +40,8 @@ def test_set_value_resize(self, float_frame, using_infer_string): assert is_float_dtype(res["baz"]) assert isna(res["baz"].drop(["foobar"])).all() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): res._set_value("foobar", "baz", "sam") - assert res.loc["foobar", "baz"] == "sam" def test_set_value_with_index_dtype_change(self): df_orig = DataFrame( diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 1fe11c62188e8..cb971b31c13c4 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.base import _registry as ea_registry from pandas.core.dtypes.common import is_object_dtype from pandas.core.dtypes.dtypes import ( @@ -144,6 +146,7 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_empty_columns(self): # GH 13522 df = DataFrame(index=["A", "B", "C"]) @@ -159,13 +162,14 @@ def test_setitem_dt64_index_empty_columns(self): df["A"] = rng assert df["A"].dtype == np.dtype("M8[ns]") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) - df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns") + df["now"] = Timestamp("20130101", tz="UTC") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] + [[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), 
columns=["now"] ) tm.assert_frame_equal(df, expected) @@ -198,6 +202,7 @@ def test_setitem_with_unaligned_sparse_value(self): expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_period_preserves_dtype(self): # GH: 26861 data = [Period("2003-12", "D")] @@ -340,8 +345,8 @@ def test_setitem_dt64tz(self, timezone_frame): # assert that A & C are not sharing the same base (e.g. they # are copies) # Note: This does not hold with Copy on Write (because of lazy copying) - v1 = df._mgr.arrays[1] - v2 = df._mgr.arrays[2] + v1 = df._mgr.blocks[1].values + v2 = df._mgr.blocks[2].values tm.assert_extension_array_equal(v1, v2) v1base = v1._ndarray.base v2base = v2._ndarray.base @@ -667,6 +672,7 @@ def test_setitem_iloc_two_dimensional_generator(self): expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]}) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") @@ -699,6 +705,7 @@ def test_setitem_ea_dtype_rhs_series(self): expected = DataFrame({"a": [1, 2]}, dtype="Int64") tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -838,6 +845,7 @@ def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): # object array of datetimes with a tz df["B"] = idx.to_pydatetime() result = df["B"] + expected = expected.dt.as_unit("us") tm.assert_series_equal(result, expected) @@ -919,6 +927,7 @@ def test_setitem_with_expansion_categorical_dtype(self): ser.name = "E" tm.assert_series_equal(result2.sort_index(), ser.sort_index()) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() @@ -1355,18 +1364,12 @@ def test_frame_setitem_empty_dataframe(self): def test_full_setter_loc_incompatible_dtype(): # https://github.com/pandas-dev/pandas/issues/55791 df = DataFrame({"a": [1, 2]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "a"] = True - expected = DataFrame({"a": [True, True]}) - tm.assert_frame_equal(df, expected) - df = DataFrame({"a": [1, 2]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "a"] = {0: 3.5, 1: 4.5} - expected = DataFrame({"a": [3.5, 4.5]}) - tm.assert_frame_equal(df, expected) - df = DataFrame({"a": [1, 2]}) df.loc[:, "a"] = {0: 3, 1: 4} expected = DataFrame({"a": [3, 4]}) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index aeffc4835a347..1d7b3e12b2e86 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -46,6 +48,7 @@ def is_ok(s): class TestDataFrameIndexingWhere: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_get(self, where_frame, float_string_frame): def _check_get(df, cond, 
check_dtypes=True): other1 = _safe_add(df) @@ -96,6 +99,7 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -170,6 +174,7 @@ def test_where_invalid(self): with pytest.raises(ValueError, match=msg): df.mask(0) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace @@ -513,26 +518,15 @@ def test_where_axis_with_upcast(self): tm.assert_frame_equal(result, expected) result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, ser, axis="index", inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Invalid value"): + result.where(mask, ser, axis="index", inplace=True) expected = DataFrame([[0, np.nan], [0, np.nan]]) result = df.where(mask, ser, axis="columns") tm.assert_frame_equal(result, expected) - expected = DataFrame( - { - 0: np.array([0, 0], dtype="int64"), - 1: np.array([np.nan, np.nan], dtype="float64"), - } - ) - result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, ser, axis="columns", inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Invalid value"): + df.where(mask, ser, axis="columns", inplace=True) def test_where_axis_multiple_dtypes(self): # Multiple dtypes (=> multiple Blocks) @@ -584,15 +578,10 @@ def test_where_axis_multiple_dtypes(self): result = df.where(mask, d1, axis="index") tm.assert_frame_equal(result, expected) result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - return_value = result.where(mask, d1, inplace=True) - assert return_value is None - tm.assert_frame_equal(result, expected) - result = df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): + result.where(mask, d1, inplace=True) + with pytest.raises(TypeError, match="Invalid value"): return_value = result.where(mask, d1, inplace=True, axis="index") - assert return_value is None - tm.assert_frame_equal(result, expected) d2 = df.copy().drop(1, axis=1) expected = df.copy() @@ -739,11 +728,8 @@ def test_where_interval_fullop_downcast(self, frame_or_series): res = obj.where(~obj.notna(), other) tm.assert_equal(res, other) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj.mask(obj.notna(), other, inplace=True) - tm.assert_equal(obj, other.astype(object)) @pytest.mark.parametrize( "dtype", @@ -773,14 +759,10 @@ def test_where_datetimelike_noop(self, dtype): res4 = df.mask(mask2, "foo") tm.assert_frame_equal(res4, df) - expected = DataFrame(4, index=df.index, columns=df.columns) # unlike where, Block.putmask does not downcast - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.mask(~mask2, 4, inplace=True) - tm.assert_frame_equal(df, expected.astype(object)) def test_where_int_downcasting_deprecated(): @@ -934,11 
+916,8 @@ def test_where_period_invalid_na(frame_or_series, as_cat, request): result = obj.mask(mask, tdnat) tm.assert_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj.mask(mask, tdnat, inplace=True) - tm.assert_equal(obj, expected) def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): @@ -1020,9 +999,7 @@ def test_where_dt64_2d(): "B": dta[:, 1], } ) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): _check_where_equivalences(df, mask, other, expected) # setting nothing in either column diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 4878f74bd152e..a01b68f1fea2a 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -72,6 +74,7 @@ def test_xs_other(self, float_frame): tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) assert not (expected == 5).all() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_xs_corner(self): # pathological mixed-type reordering case df = DataFrame(index=[0]) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index fb288e19c6e82..1c3c41e2e0299 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -236,32 +236,30 @@ def test_asfreq_2ME(self, freq, freq_half): "freq, freq_depr", [ ("2ME", "2M"), + ("2ME", "2m"), ("2QE", "2Q"), ("2QE-SEP", "2Q-SEP"), ("1BQE", "1BQ"), ("2BQE-SEP", "2BQ-SEP"), - ("1YE", "1Y"), + ("2BQE-SEP", "2bq-sep"), + ("1YE", "1y"), ("2YE-MAR", "2Y-MAR"), ], ) - def test_asfreq_frequency_M_Q_Y_deprecated(self, freq, freq_depr): - # GH#9586, #55978 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
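Context for the asfreq hunk below: the legacy offset aliases exercised here ("2M", "2m", "2Q-SEP", "2bq-sep", "1y", ...) no longer warn, they are rejected outright, which is why the test is renamed from *_deprecated to *_raises and the assert_produces_warning scaffolding disappears. A minimal sketch of the enforced behavior, assuming a build where the removal has landed:

    import pandas as pd

    rng = pd.date_range("1/1/2000", periods=4, freq="ME")  # "ME" is the supported month-end alias
    df = pd.DataFrame({"s": [0.0, 1.0, 2.0, 3.0]}, index=rng)

    df.asfreq("2ME")  # fine: new-style alias
    try:
        df.asfreq("2M")  # legacy alias, formerly a FutureWarning
    except ValueError as err:
        assert "Invalid frequency: 2M" in str(err)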
+ def test_asfreq_frequency_M_Q_Y_raises(self, freq, freq_depr): + msg = f"Invalid frequency: {freq_depr}" index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}") df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) - expected = df.asfreq(freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = df.asfreq(freq=freq_depr) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + df.asfreq(freq=freq_depr) @pytest.mark.parametrize( "freq, error_msg", [ ( "2MS", - "MS is not supported as period frequency", + "Invalid frequency: 2MS", ), ( offsets.MonthBegin(), diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 55f8052d05cf1..c6c702a1a0b1b 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -149,7 +151,7 @@ def test_astype_str(self): # see GH#9757 a = Series(date_range("2010-01-04", periods=5)) b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) - c = Series([Timedelta(x, unit="d") for x in range(5)]) + c = Series([Timedelta(x, unit="D") for x in range(5)]) d = Series(range(5)) e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) @@ -715,8 +717,12 @@ def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): df.astype(float, errors=errors) def test_astype_tz_conversion(self): - # GH 35973 - val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + # GH 35973, GH#58998 + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + val = { + "tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London") + } df = DataFrame(val) result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"}) @@ -727,7 +733,7 @@ def test_astype_tz_conversion(self): @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"]) def test_astype_tz_object_conversion(self, tz): # GH 35973 - val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + val = {"tz": date_range("2020-08-30", freq="D", periods=2, tz="Europe/London")} expected = DataFrame(val) # convert expected to object dtype from other tz str (independently tested) @@ -738,6 +744,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_dt64_to_string( self, frame_or_series, tz_naive_fixture, using_infer_string ): @@ -890,3 +897,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val): with option_context("mode.string_storage", string_storage): df.astype("string") tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val): + # GH#51073 - variant of the above test with explicit dtype instances + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + df.astype(any_string_dtype) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index 126899826fac3..b69db80dee446 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -1,8 
+1,11 @@ -from datetime import time +from datetime import ( + time, + timezone, +) +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs.tslibs import timezones @@ -65,7 +68,7 @@ def test_at_time_nonexistent(self, frame_or_series): assert len(rs) == 0 @pytest.mark.parametrize( - "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)] + "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=timezone.utc)] ) def test_at_time_errors(self, hour): # GH#24043 @@ -83,7 +86,7 @@ def test_at_time_tz(self): # GH#24043 dti = date_range("2018", periods=3, freq="h", tz="US/Pacific") df = DataFrame(list(range(len(dti))), index=dti) - result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern"))) + result = df.at_time(time(4, tzinfo=zoneinfo.ZoneInfo("US/Eastern"))) expected = df.iloc[1:2] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 8aeab5dacd8b4..87b7d5052a345 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import is_dtype_equal @@ -30,6 +32,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] @@ -199,7 +202,7 @@ def test_combine_first_align_nan(self): # GH 7509 (not fixed) dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) dfb = DataFrame([[4], [5]], columns=["b"]) - assert dfa["a"].dtype == "datetime64[ns]" + assert dfa["a"].dtype == "datetime64[s]" assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) @@ -208,7 +211,7 @@ def test_combine_first_align_nan(self): columns=["a", "b"], ) tm.assert_frame_equal(res, exp) - assert res["a"].dtype == "datetime64[ns]" + assert res["a"].dtype == "datetime64[s]" # TODO: this must be int64 assert res["b"].dtype == "int64" @@ -226,13 +229,13 @@ def test_combine_first_timezone(self, unit): df1 = DataFrame( columns=["UTCdatetime", "abc"], data=data1, - index=pd.date_range("20140627", periods=1), + index=pd.date_range("20140627", periods=1, unit=unit), ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit) df2 = DataFrame( columns=["UTCdatetime", "xyz"], data=data2, - index=pd.date_range("20140628", periods=1), + index=pd.date_range("20140628", periods=1, unit=unit), ) res = df2[["UTCdatetime"]].combine_first(df1) exp = DataFrame( @@ -244,7 +247,7 @@ def test_combine_first_timezone(self, unit): "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], }, columns=["UTCdatetime", "abc"], - index=pd.date_range("20140627", periods=2, freq="D"), + index=pd.date_range("20140627", periods=2, freq="D", unit=unit), dtype=f"datetime64[{unit}, UTC]", ) assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]" @@ -288,18 +291,17 @@ def test_combine_first_timezone3(self, unit): exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - # FIXME: parametrizing over unit breaks on non-nano - def test_combine_first_timezone4(self): + def test_combine_first_timezone4(self, unit): # different tz - dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") 
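Context for the combine_first timezone hunks: the removed FIXME above is resolved by threading the suite's unit fixture through date_range, which has accepted a unit= keyword since pandas 2.0, and by asserting that combine_first preserves the non-nanosecond resolution instead of hardcoding "ns". Roughly, under those assumptions:

    import pandas as pd

    for unit in ("s", "ms", "us", "ns"):  # the values the fixture cycles through
        dts = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern", unit=unit)
        assert dts.dtype == f"datetime64[{unit}, US/Eastern]"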
+ dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern", unit=unit) df1 = DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-03", "2015-01-05") + dts2 = pd.date_range("2015-01-03", "2015-01-05", unit=unit) df2 = DataFrame({"DATE": dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" + assert res["DATE"].dtype == f"datetime64[{unit}, US/Eastern]" def test_combine_first_timezone5(self, unit): dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit) @@ -420,7 +422,11 @@ def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]]) - if is_dtype_equal(common_dtype, "object") or frame.dtypes["b"] == other.dtypes["b"]: + if ( + is_dtype_equal(common_dtype, "object") + or frame.dtypes["b"] == other.dtypes["b"] + or frame.dtypes["b"].kind == frame.dtypes["b"].kind == "M" + ): val = scalar1 else: val = na_value diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 75e60a4816902..2ffc3f933e246 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -21,7 +21,7 @@ def test_compare_axis(align_axis): result = df.compare(df2, align_axis=align_axis) if align_axis in (1, "columns"): - indices = pd.Index([0, 2]) + indices = pd.RangeIndex(0, 4, 2) columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) expected = pd.DataFrame( [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]], @@ -29,7 +29,7 @@ def test_compare_axis(align_axis): columns=columns, ) else: - indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + indices = pd.MultiIndex.from_product([range(0, 4, 2), ["self", "other"]]) columns = pd.Index(["col1", "col3"]) expected = pd.DataFrame( [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]], @@ -60,7 +60,7 @@ def test_compare_various_formats(keep_shape, keep_equal): result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal) if keep_shape: - indices = pd.Index([0, 1, 2]) + indices = pd.RangeIndex(3) columns = pd.MultiIndex.from_product( [["col1", "col2", "col3"], ["self", "other"]] ) @@ -85,7 +85,7 @@ def test_compare_various_formats(keep_shape, keep_equal): columns=columns, ) else: - indices = pd.Index([0, 2]) + indices = pd.RangeIndex(0, 4, 2) columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) expected = pd.DataFrame( [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]], index=indices, columns=columns @@ -203,6 +203,7 @@ def test_compare_result_names(): }, ) result = df1.compare(df2, result_names=("left", "right")) + result.index = pd.Index([0, 2]) expected = pd.DataFrame( { ("col1", "left"): {0: "a", 2: np.nan}, diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 521d2cb14ac6a..59779234b46d9 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,11 +3,15 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm class TestConvertDtypes: + # TODO convert_dtypes should not use NaN variant of string dtype, but always NA + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, 
"Int32")] ) @@ -16,8 +20,6 @@ def test_convert_dtypes( ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here - if using_infer_string: - string_storage = "pyarrow_numpy" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), @@ -180,6 +182,7 @@ def test_convert_dtypes_pyarrow_timestamp(self): result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_avoid_block_splitting(self): # GH#55341 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) @@ -194,6 +197,7 @@ def test_convert_dtypes_avoid_block_splitting(self): tm.assert_frame_equal(result, expected) assert result._mgr.nblocks == 2 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dtypes_from_arrow(self): # GH#56581 df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 4151a1d27d06a..c15952339ef18 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -214,7 +216,7 @@ def test_corr_item_cache(self): df["B"] = range(10)[::-1] ser = df["A"] # populate item_cache - assert len(df._mgr.arrays) == 2 # i.e. 2 blocks + assert len(df._mgr.blocks) == 2 _ = df.corr(numeric_only=True) @@ -318,6 +320,7 @@ def test_corrwith_non_timeseries_data(self): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 6bea97b2cf189..419fb75cb3669 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -411,10 +411,15 @@ def test_drop_duplicates_inplace(): @pytest.mark.parametrize( "origin_dict, output_dict, ignore_index, output_index", [ - ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), - ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), - ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), - ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, range(2)), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, range(0, 4, 2)), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, range(2)), + ( + {"A": [2, 2, 3], "B": [2, 2, 4]}, + {"A": [2, 3], "B": [2, 4]}, + False, + range(0, 4, 2), + ), ], ) def test_drop_duplicates_ignore_index( diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 7899b4aeac3fd..4a60dc09cfe07 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -182,6 +184,7 @@ def test_dropna_multiple_axes(self): with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), 
inplace=True) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() @@ -195,7 +198,7 @@ def test_dropna_tz_aware_datetime(self): # Ex2 df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) result = df.dropna(axis=0) - expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) + expected = DataFrame([dt1, dt2], columns=["Time"], index=range(0, 6, 3)) tm.assert_frame_equal(result, expected) def test_dropna_categorical_interval_index(self): @@ -233,7 +236,7 @@ def test_set_single_column_subset(self): # GH 41021 df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.nan, 5]}) expected = DataFrame( - {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2] + {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=range(0, 4, 2) ) result = df.dropna(subset="C") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 0697f59cd271f..1685f9ee331f5 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -133,6 +135,7 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index ca9764c023244..876ad5539d603 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -210,7 +210,7 @@ def test_ignore_index(): df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}) result = df.explode("values", ignore_index=True) expected = pd.DataFrame( - {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] + {"id": [0, 0, 10, 10], "values": list("abcd")}, index=range(4) ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 2ef7780e9a6d5..ad1a37916e381 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -47,7 +47,7 @@ def test_fillna_on_column_view(self): assert np.isnan(arr[:, 0]).all() # i.e. 
we didn't create a new 49-column block - assert len(df._mgr.arrays) == 1 + assert len(df._mgr.blocks) == 1 assert np.shares_memory(df.values, arr) def test_fillna_datetime(self, datetime_frame): @@ -65,7 +65,8 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(TypeError, match=msg): datetime_frame.fillna() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + # TODO(infer_string) test as actual error instead of xfail + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -83,6 +84,7 @@ def test_fillna_mixed_float(self, mixed_float_frame): result = mf.ffill() _check_mixed_float(result, dtype={"C": None}) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( @@ -274,6 +276,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) @@ -537,7 +540,8 @@ def test_fillna_col_reordering(self): filled = df.ffill() assert df.columns.tolist() == filled.columns.tolist() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + # TODO(infer_string) test as actual error instead of xfail + @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan diff --git a/pandas/tests/frame/methods/test_infer_objects.py b/pandas/tests/frame/methods/test_infer_objects.py index a824a615b5c29..c7cdcd177403b 100644 --- a/pandas/tests/frame/methods/test_infer_objects.py +++ b/pandas/tests/frame/methods/test_infer_objects.py @@ -25,7 +25,7 @@ def test_infer_objects(self): assert df["a"].dtype == "int64" assert df["b"].dtype == "float64" - assert df["c"].dtype == "M8[ns]" + assert df["c"].dtype == "M8[us]" assert df["d"].dtype == "object" expected = DataFrame( diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 4e3726f4dc51d..a4319f8a8ae7f 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( IS64, PYPY, @@ -15,6 +17,7 @@ from pandas import ( CategoricalIndex, DataFrame, + Index, MultiIndex, Series, date_range, @@ -360,7 +363,7 @@ def test_info_memory_usage(): df = DataFrame(data) df.columns = dtypes - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) @@ -398,25 +401,25 @@ def test_info_memory_usage(): @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") def test_info_memory_usage_deep_not_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( 
df_with_object_index.memory_usage(index=True, deep=True).sum() > df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object)) assert ( df_with_object_index.memory_usage(index=True, deep=True).sum() == df_with_object_index.memory_usage(index=True).sum() ) - df_object = DataFrame({"a": ["a"]}) + df_object = DataFrame({"a": Series(["a"], dtype=object)}) assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() @@ -432,6 +435,7 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_info_memory_usage_qualified(): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) @@ -492,6 +496,7 @@ def test_info_categorical(): df.info(buf=buf) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") def test_info_int_columns(): # GH#37245 @@ -515,6 +520,7 @@ def test_info_int_columns(): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index cdb9ff8a67b6b..b8a34d5eaa226 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -64,8 +64,9 @@ def test_interpolate_inplace(self, frame_or_series, request): assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 + # TODO(infer_string) raise proper TypeError in case of string dtype @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic(self): df = DataFrame( @@ -90,7 +91,7 @@ def test_interp_basic(self): assert np.shares_memory(df["D"]._values, dvalues) @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + using_string_dtype(), reason="interpolate doesn't work for string" ) def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 82802dd6e99eb..7de87e633cfb1 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -1,4 +1,5 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest @@ -543,17 +544,14 @@ def test_merge_join_different_levels_raises(self): df1.join(df2, on="a") def test_frame_join_tzaware(self): + tz = zoneinfo.ZoneInfo("US/Central") test1 = DataFrame( np.zeros((6, 3)), - index=date_range( - "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central" - ), + index=date_range("2012-11-15 00:00:00", periods=6, freq="100ms", tz=tz), ) test2 = 
DataFrame( np.zeros((3, 3)), - index=date_range( - "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central" - ), + index=date_range("2012-11-15 00:00:00", periods=3, freq="250ms", tz=tz), columns=range(3, 6), ) @@ -561,4 +559,4 @@ def test_frame_join_tzaware(self): expected = test1.index.union(test2.index) tm.assert_index_equal(result.index, expected) - assert result.index.tz.zone == "US/Central" + assert result.index.tz.key == "US/Central" diff --git a/pandas/tests/frame/methods/test_map.py b/pandas/tests/frame/methods/test_map.py index fe9661a3edc1b..9850de14b2092 100644 --- a/pandas/tests/frame/methods/test_map.py +++ b/pandas/tests/frame/methods/test_map.py @@ -158,14 +158,15 @@ def test_map_box(): tm.assert_frame_equal(result, expected) -def test_frame_map_dont_convert_datetime64(): - df = DataFrame({"x1": [datetime(1996, 1, 1)]}) +def test_frame_map_dont_convert_datetime64(unit): + dtype = f"M8[{unit}]" + df = DataFrame({"x1": [datetime(1996, 1, 1)]}, dtype=dtype) df = df.map(lambda x: x + BDay()) df = df.map(lambda x: x + BDay()) result = df.x1.dtype - assert result == "M8[ns]" + assert result == dtype def test_map_function_runs_once(): diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 7b6a0487c296a..56bb3126455a5 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -82,6 +82,7 @@ def test_nlargest_n(self, nselect_method, n, order): else: ascending = nselect_method == "nsmallest" result = getattr(df, nselect_method)(n, order) + result.index = pd.Index(list(result.index)) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) @@ -132,7 +133,7 @@ def test_nlargest_n_identical_values(self): df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) result = df.nlargest(3, "a") - expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=range(3)) tm.assert_frame_equal(result, expected) result = df.nsmallest(3, "a") @@ -179,18 +180,20 @@ def test_nlargest_duplicate_keep_all_ties(self): result = df.nlargest(4, "a", keep="all") expected = pd.DataFrame( { - "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, - } + "a": [5, 4, 4, 3, 3, 3, 3], + "b": [10, 9, 8, 5, 50, 10, 20], + }, + index=[0, 1, 2, 4, 5, 6, 7], ) tm.assert_frame_equal(result, expected) result = df.nsmallest(2, "a", keep="all") expected = pd.DataFrame( { - "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, - } + "a": [2, 3, 3, 3, 3], + "b": [7, 5, 50, 10, 20], + }, + index=range(3, 8), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 32ae4c0ff2f50..fedbdbc98660f 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -127,7 +129,7 @@ def test_axis_numeric_only_true(self, interp_method): result = df.quantile( 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method ) - expected = Series([3.0, 4.0], index=[0, 1], name=0.5) + expected = Series([3.0, 4.0], index=range(2), name=0.5) if interpolation == "nearest": expected = expected.astype(np.int64) tm.assert_series_equal(result, expected) @@ 
-324,6 +326,7 @@ def test_quantile_multi_empty(self, interp_method): ) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_datetime(self, unit): dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) df = DataFrame({"a": dti, "b": [0, 5]}) @@ -377,6 +380,7 @@ def test_quantile_datetime(self, unit): expected = DataFrame(index=[0.5], columns=[]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [ @@ -641,6 +645,7 @@ def test_quantile_nat(self, interp_method, unit): ) tm.assert_frame_equal(res, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_empty_no_rows_floats(self, interp_method): interpolation, method = interp_method @@ -710,14 +715,14 @@ def test_quantile_empty_no_columns(self, interp_method): result = df.quantile( 0.5, numeric_only=True, interpolation=interpolation, method=method ) - expected = Series([], index=[], name=0.5, dtype=np.float64) + expected = Series([], name=0.5, dtype=np.float64) expected.index.name = "captain tightpants" tm.assert_series_equal(result, expected) result = df.quantile( [0.5], numeric_only=True, interpolation=interpolation, method=method ) - expected = DataFrame([], index=[0.5], columns=[]) + expected = DataFrame([], index=[0.5]) expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) @@ -869,6 +874,7 @@ def test_quantile_ea_scalar(self, request, obj, index): else: tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype, expected_data, expected_index, axis", [ @@ -887,6 +893,7 @@ def test_empty_numeric(self, dtype, expected_data, expected_index, axis): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype, expected_data, expected_index, axis, expected_dtype", [ @@ -905,6 +912,7 @@ def test_empty_datelike( ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "expected_data, expected_index, axis", [ @@ -926,3 +934,12 @@ def test_datelike_numeric_only(self, expected_data, expected_index, axis): expected_data, name=0.5, index=Index(expected_index), dtype=np.float64 ) tm.assert_series_equal(result, expected) + + +def test_multi_quantile_numeric_only_retains_columns(): + df = DataFrame(list("abc")) + result = df.quantile([0.5, 0.7], numeric_only=True) + expected = DataFrame(index=[0.5, 0.7]) + tm.assert_frame_equal( + result, expected, check_index_type=True, check_column_type=True + ) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 45109991c4553..37adc31fb0f4d 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -754,7 +754,10 @@ def test_reindex_axes(self): index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], columns=["a", "b", "c"], ) - time_freq = date_range("2012-01-01", "2012-01-03", freq="d") + + msg = "'d' is deprecated and will be removed in a future version." 
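A note on the warning matched in this reindex hunk: lowercase "d" still resolves to a daily frequency on this branch but emits the FutureWarning being asserted, while the canonical "D" spelling stays silent. A sketch:

    import pandas as pd
    import pandas._testing as tm

    with tm.assert_produces_warning(FutureWarning, match="'d' is deprecated"):
        idx = pd.date_range("2012-01-01", "2012-01-03", freq="d")
    assert idx.equals(pd.date_range("2012-01-01", "2012-01-03", freq="D"))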
+ with tm.assert_produces_warning(FutureWarning, match=msg): + time_freq = date_range("2012-01-01", "2012-01-03", freq="d") some_cols = ["a", "b"] index_freq = df.reindex(index=time_freq).index.freq diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index fb7ba2b7af38a..6b872bf48d550 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -30,9 +30,7 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -293,9 +291,7 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame( @@ -304,9 +300,7 @@ def test_regex_replace_dict_nested_gh4115(self): result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -332,9 +326,7 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? 
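Most of the churn in test_replace.py is mechanical: the using_pyarrow_string_dtype predicate is renamed to using_string_dtype and the three-line xfail decorators collapse to one line; the guard pattern itself is unchanged. A sketch of how such a marker reads under the renamed flag (the test body here is hypothetical, not from the suite):

    import pytest
    from pandas._config import using_string_dtype  # formerly using_pyarrow_string_dtype

    import pandas as pd

    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
    def test_replace_str_with_float_sketch():
        df = pd.DataFrame({"a": ["x", "y"]})
        result = df.replace("x", 1.5)  # a float cannot be stored in a string column
        assert result.loc[0, "a"] == 1.5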
df = DataFrame(mix_abc) @@ -350,9 +342,7 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -545,9 +535,7 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -557,9 +545,7 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan @@ -615,6 +601,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( @@ -902,9 +889,7 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -933,9 +918,7 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_dict_no_regex(self): answer = Series( { @@ -957,9 +940,7 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_series_no_regex(self): answer = Series( { @@ -1064,9 +1045,7 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) @@ -1171,38 +1150,6 @@ def test_replace_with_empty_dictlike(self, mix_abc): tm.assert_frame_equal(df, df.replace({"b": {}})) tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) - @pytest.mark.parametrize( - "replace_dict, final_data", - [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], - ) - def 
test_categorical_replace_with_dict(self, replace_dict, final_data): - # GH 26988 - df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") - - final_data = np.array(final_data) - - a = pd.Categorical(final_data[:, 0], categories=[3, 2]) - - ex_cat = [3, 2] if replace_dict["b"] == 1 else [1, 3] - b = pd.Categorical(final_data[:, 1], categories=ex_cat) - - expected = DataFrame({"a": a, "b": b}) - msg2 = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = df.replace(replace_dict, 3) - tm.assert_frame_equal(result, expected) - msg = ( - r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " - "different" - ) - with pytest.raises(AssertionError, match=msg): - # ensure non-inplace call does not affect original - tm.assert_frame_equal(df, expected) - with tm.assert_produces_warning(FutureWarning, match=msg2): - return_value = df.replace(replace_dict, 3, inplace=True) - assert return_value is None - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "df, to_replace, exp", [ @@ -1229,9 +1176,7 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize( "replacer", [ @@ -1300,6 +1245,31 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[2, 2], [2, 2]]), ({"a": 1, "b": 2}, [[2, 1], [2, 2]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[1, 2]) + b = pd.Categorical(final_data[:, 1], categories=[1, 2]) + + expected = DataFrame({"a": a, "b": b}) + result = df.replace(replace_dict, 2) + tm.assert_frame_equal(result, expected) + msg = r"DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " "different" + with pytest.raises(AssertionError, match=msg): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + return_value = df.replace(replace_dict, 2, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1345,15 +1315,17 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"d": "z"}) + ) + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"obj1": "obj9"}) + ) + result = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"}) tm.assert_frame_equal(result, expected) def 
test_replace_dict_category_type(self): @@ -1378,12 +1350,11 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + result = input_df.apply( + lambda x: x.cat.rename_categories( + {"a": "z", "obj1": "obj9", "cat1": "catX"} + ) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) @@ -1395,6 +1366,7 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_intervals(self): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) @@ -1499,9 +1471,7 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't set float into string" - ) + @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 22ce091d4ed62..c487bc4cfb89a 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -600,8 +602,8 @@ def test_reset_index_with_drop( {"a": [pd.NaT, Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]}, ), ( - [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)], - {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]}, + [(pd.NaT, 1), (pd.Timedelta(123, "D"), 2)], + {"a": [pd.NaT, pd.Timedelta(123, "D")], "b": [1, 2], "x": [11, 12]}, ), ], ) @@ -642,6 +644,7 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): tm.assert_frame_equal(res, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "array, dtype", [ diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 72c1a123eac98..4e490e9e344ba 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -320,7 +320,7 @@ def test_shift_categorical1(self, frame_or_series): def get_cat_values(ndframe): # For Series we could just do ._values; for DataFrame # we may be able to do this if we ever have 2D Categoricals - return ndframe._mgr.arrays[0] + return ndframe._mgr.blocks[0].values cat = get_cat_values(obj) @@ -560,7 +560,7 @@ def test_shift_dt64values_int_fill_deprecated(self): # same thing but not consolidated; pre-2.0 we got different behavior df3 = DataFrame({"A": ser}) df3["B"] = ser - assert len(df3._mgr.arrays) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(1, axis=1, fill_value=0) tm.assert_frame_equal(result, expected) @@ -621,7 +621,7 @@ def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat): # same thing but not consolidated df3 = DataFrame({"A": ser}) df3["B"] = ser - assert len(df3._mgr.arrays) == 2 + assert len(df3._mgr.blocks) == 2 result = df3.shift(-1, axis=1, 
fill_value="foo") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index c146dcc9c2d71..e728526519e9d 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -170,7 +170,7 @@ def test_sort_values_multicolumn_uint64(self): "a": pd.Series([18446637057563306014, 1162265347240853609]), "b": pd.Series([1, 2]), }, - index=pd.Index([1, 0]), + index=range(1, -1, -1), ) tm.assert_frame_equal(result, expected) @@ -360,7 +360,7 @@ def test_sort_values_nat_values_in_int_column(self): df_reversed = DataFrame( {"int": int_values[::-1], "float": float_values[::-1]}, columns=["int", "float"], - index=[1, 0], + index=range(1, -1, -1), ) # NaT is not a "na" for int64 columns, so na_position must not @@ -385,7 +385,7 @@ def test_sort_values_nat_values_in_int_column(self): df_reversed = DataFrame( {"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]}, columns=["datetime", "float"], - index=[1, 0], + index=range(1, -1, -1), ) df_sorted = df.sort_values(["datetime", "float"], na_position="first") @@ -540,19 +540,19 @@ def test_sort_values_na_position_with_categories_raises(self): @pytest.mark.parametrize( "original_dict, sorted_dict, ignore_index, output_index", [ - ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]), - ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, range(3)), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, range(2, -1, -1)), ( {"A": [1, 2, 3], "B": [2, 3, 4]}, {"A": [3, 2, 1], "B": [4, 3, 2]}, True, - [0, 1, 2], + range(3), ), ( {"A": [1, 2, 3], "B": [2, 3, 4]}, {"A": [3, 2, 1], "B": [4, 3, 2]}, False, - [2, 1, 0], + range(2, -1, -1), ), ], ) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 3a87f7ded1759..7fb1658394632 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserError import pandas as pd @@ -42,6 +44,7 @@ def test_to_csv_from_csv1(self, temp_file, float_frame): float_frame.to_csv(path, header=False) float_frame.to_csv(path, index=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_from_csv1_datetime(self, temp_file, datetime_frame): path = str(temp_file) # test roundtrip @@ -49,7 +52,9 @@ def test_to_csv_from_csv1_datetime(self, temp_file, datetime_frame): datetime_frame.index = datetime_frame.index._with_freq(None) datetime_frame.to_csv(path) recons = self.read_csv(path, parse_dates=True) - tm.assert_frame_equal(datetime_frame, recons) + expected = datetime_frame.copy() + expected.index = expected.index.as_unit("s") + tm.assert_frame_equal(expected, recons) datetime_frame.to_csv(path, index_label="index") recons = self.read_csv(path, index_col=None, parse_dates=True) @@ -149,9 +154,11 @@ def test_to_csv_from_csv5(self, temp_file, timezone_frame): lambda c: to_datetime(result[c]) .dt.tz_convert("UTC") .dt.tz_convert(timezone_frame[c].dt.tz) + .dt.as_unit("ns") ) result["B"] = converter("B") result["C"] = converter("C") + result["A"] = result["A"].dt.as_unit("ns") tm.assert_frame_equal(result, timezone_frame) def test_to_csv_cols_reordering(self, temp_file): @@ -233,8 +240,12 @@ def make_dtnat_arr(n, nnat=None): df = DataFrame({"a": s1, "b": s2}) df.to_csv(path, chunksize=chunksize) 
- recons = self.read_csv(path).apply(to_datetime) - tm.assert_frame_equal(df, recons, check_names=False) + result = self.read_csv(path).apply(to_datetime) + + expected = df[:] + expected["a"] = expected["a"].astype("M8[s]") + expected["b"] = expected["b"].astype("M8[s]") + tm.assert_frame_equal(result, expected, check_names=False) def _return_result_expected( self, @@ -352,6 +363,7 @@ def test_to_csv_nrows(self, nrows): columns=Index(list("abcd"), dtype=object), ) result, expected = self._return_result_expected(df, 1000, "dt", "s") + expected.index = expected.index.astype("M8[ns]") tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @@ -381,6 +393,10 @@ def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols): r_idx_type, c_idx_type, ) + if r_idx_type in ["dt", "p"]: + expected.index = expected.index.astype("M8[ns]") + if c_idx_type in ["dt", "p"]: + expected.columns = expected.columns.astype("M8[ns]") tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @@ -423,6 +439,7 @@ def test_to_csv_empty(self): result, expected = self._return_result_expected(df, 1000) tm.assert_frame_equal(result, expected, check_column_type=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.slow def test_to_csv_chunksize(self): chunksize = 1000 @@ -435,6 +452,7 @@ def test_to_csv_chunksize(self): result, expected = self._return_result_expected(df, chunksize, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] @@ -531,6 +549,7 @@ def test_to_csv_headers(self, temp_file): assert return_value is None tm.assert_frame_equal(to_df, recons) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_multiindex(self, temp_file, float_frame, datetime_frame): frame = float_frame old_index = frame.index @@ -565,7 +584,9 @@ def test_to_csv_multiindex(self, temp_file, float_frame, datetime_frame): recons = self.read_csv(path, index_col=[0, 1], parse_dates=True) # TODO to_csv drops column name - tm.assert_frame_equal(tsframe, recons, check_names=False) + expected = tsframe.copy() + expected.index = MultiIndex.from_arrays([old_index.as_unit("s"), new_index[1]]) + tm.assert_frame_equal(recons, expected, check_names=False) # do not load index tsframe.to_csv(path) @@ -722,6 +743,7 @@ def test_to_csv_withcommas(self, temp_file): df2 = self.read_csv(path) tm.assert_frame_equal(df2, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_mixed(self, temp_file): def create_cols(name): return [f"{name}{i:03d}" for i in range(5)] @@ -741,7 +763,7 @@ def create_cols(name): "foo", index=df_float.index, columns=create_cols("object") ) df_dt = DataFrame( - Timestamp("20010101").as_unit("ns"), + Timestamp("20010101"), index=df_float.index, columns=create_cols("date"), ) @@ -789,9 +811,7 @@ def test_to_csv_dups_cols(self, temp_file): ) df_bool = DataFrame(True, index=df_float.index, columns=range(3)) df_object = DataFrame("foo", index=df_float.index, columns=range(3)) - df_dt = DataFrame( - Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3) - ) + df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) df = pd.concat( [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True ) @@ -809,6 +829,7 @@ 
def test_to_csv_dups_cols(self, temp_file): result.columns = df.columns tm.assert_frame_equal(result, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_dups_cols2(self, temp_file): # GH3457 df = DataFrame( @@ -1167,7 +1188,11 @@ def test_to_csv_with_dst_transitions(self, td, temp_file): # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) - result.index = to_datetime(result.index, utc=True).tz_convert("Europe/London") + result.index = ( + to_datetime(result.index, utc=True) + .tz_convert("Europe/London") + .as_unit("ns") + ) tm.assert_frame_equal(result, df) @pytest.mark.parametrize( @@ -1186,8 +1211,10 @@ def test_to_csv_with_dst_transitions_with_pickle(self, start, end, temp_file): with tm.ensure_clean("csv_date_format_with_dst") as path: df.to_csv(path, index=True) result = read_csv(path, index_col=0) - result.index = to_datetime(result.index, utc=True).tz_convert( - "Europe/Paris" + result.index = ( + to_datetime(result.index, utc=True) + .tz_convert("Europe/Paris") + .as_unit("ns") ) result["idx"] = to_datetime(result["idx"], utc=True).astype( "datetime64[ns, Europe/Paris]" diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 0272b679e85a2..c43d947b4877e 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -2,11 +2,13 @@ OrderedDict, defaultdict, ) -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest -import pytz from pandas import ( NA, @@ -209,15 +211,15 @@ def test_to_dict_tz(self): # GH#18372 When converting to dict with orient='records' columns of # datetime that are tz-aware were not converted to required arrays data = [ - (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), - (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=timezone.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=timezone.utc),), ] df = DataFrame(list(data), columns=["d"]) result = df.to_dict(orient="records") expected = [ - {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, - {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, + {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=timezone.utc)}, + {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=timezone.utc)}, ] tm.assert_dict_equal(result[0], expected[0]) tm.assert_dict_equal(result[1], expected[1]) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 0f1f643209db0..4f621b4643b70 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, MultiIndex, @@ -25,6 +27,7 @@ def test_no_copy_blocks(self, float_frame): assert _last_df is not None and not _last_df[column].equals(df[column]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_dict_of_blocks_item_cache(): # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index f42fd4483e9ac..1b7b30ac40363 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ 
b/pandas/tests/frame/methods/test_transpose.py
@@ -25,6 +25,7 @@ def test_transpose_td64_intervals(self):
         df = DataFrame(ii)
 
         result = df.T
+        result.columns = Index(list(range(len(ii))))
         expected = DataFrame({i: ii[i : i + 1] for i in range(len(ii))})
         tm.assert_frame_equal(result, expected)
 
@@ -153,7 +154,6 @@ def test_transpose_not_inferring_dt(self):
         result = df.T
         expected = DataFrame(
             [[Timestamp("2019-12-31"), Timestamp("2019-12-31")]],
-            columns=[0, 1],
             index=["a"],
             dtype=object,
         )
@@ -175,7 +175,6 @@ def test_transpose_not_inferring_dt_mixed_blocks(self):
                 [Timestamp("2019-12-31"), Timestamp("2019-12-31")],
                 [Timestamp("2019-12-31"), Timestamp("2019-12-31")],
             ],
-            columns=[0, 1],
             index=["a", "b"],
             dtype=object,
         )
diff --git a/pandas/tests/frame/methods/test_tz_convert.py b/pandas/tests/frame/methods/test_tz_convert.py
index e9209f218bca9..5ee4021102f22 100644
--- a/pandas/tests/frame/methods/test_tz_convert.py
+++ b/pandas/tests/frame/methods/test_tz_convert.py
@@ -1,3 +1,5 @@
+import zoneinfo
+
 import numpy as np
 import pytest
 
@@ -13,28 +15,34 @@
 
 class TestTZConvert:
     def test_tz_convert(self, frame_or_series):
-        rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")
+        rng = date_range(
+            "1/1/2011", periods=200, freq="D", tz=zoneinfo.ZoneInfo("US/Eastern")
+        )
 
         obj = DataFrame({"a": 1}, index=rng)
         obj = tm.get_obj(obj, frame_or_series)
 
-        result = obj.tz_convert("Europe/Berlin")
-        expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
+        berlin = zoneinfo.ZoneInfo("Europe/Berlin")
+        result = obj.tz_convert(berlin)
+        expected = DataFrame({"a": 1}, rng.tz_convert(berlin))
         expected = tm.get_obj(expected, frame_or_series)
 
-        assert result.index.tz.zone == "Europe/Berlin"
+        assert result.index.tz.key == "Europe/Berlin"
         tm.assert_equal(result, expected)
 
     def test_tz_convert_axis1(self):
-        rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern")
+        rng = date_range(
+            "1/1/2011", periods=200, freq="D", tz=zoneinfo.ZoneInfo("US/Eastern")
+        )
 
         obj = DataFrame({"a": 1}, index=rng)
         obj = obj.T
 
-        result = obj.tz_convert("Europe/Berlin", axis=1)
-        assert result.columns.tz.zone == "Europe/Berlin"
+        berlin = zoneinfo.ZoneInfo("Europe/Berlin")
+        result = obj.tz_convert(berlin, axis=1)
+        assert result.columns.tz.key == "Europe/Berlin"
 
-        expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin"))
+        expected = DataFrame({"a": 1}, rng.tz_convert(berlin))
         tm.assert_equal(result, expected.T)
diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py
index 269b9e372bd70..ea63b2264d4f6 100644
--- a/pandas/tests/frame/methods/test_update.py
+++ b/pandas/tests/frame/methods/test_update.py
@@ -152,18 +152,9 @@ def test_update_with_different_dtype(self):
         # GH#3217
         df = DataFrame({"a": [1, 3], "b": [np.nan, 2]})
         df["c"] = np.nan
-        with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
+        with pytest.raises(TypeError, match="Invalid value"):
             df.update({"c": Series(["foo"], index=[0])})
 
-        expected = DataFrame(
-            {
-                "a": [1, 3],
-                "b": [np.nan, 2],
-                "c": Series(["foo", np.nan], dtype="object"),
-            }
-        )
-        tm.assert_frame_equal(df, expected)
-
     def test_update_modify_view(self, using_infer_string):
         # GH#47188
         df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]})
diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py
index dfece3fc7552b..2de2053bb705f 100644
--- a/pandas/tests/frame/methods/test_values.py
+++ b/pandas/tests/frame/methods/test_values.py
@@ -256,7 +256,7 @@ def
test_private_values_dt64_multiblock(self): df = DataFrame({"A": dta[:4]}, copy=False) df["B"] = dta[4:] - assert len(df._mgr.arrays) == 2 + assert len(df._mgr.blocks) == 2 result = df._values expected = dta.reshape(2, 4).T diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index c68171ab254c7..b4c16b94fcf8b 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -1,6 +1,7 @@ -from datetime import datetime - -import pytz +from datetime import ( + datetime, + timezone, +) from pandas import DataFrame import pandas._testing as tm @@ -13,7 +14,7 @@ def test_set_axis_setattr_index(self): # GH 6785 # set the index manually - df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1}]) + df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=timezone.utc), "foo": 1}]) expected = df.set_index("ts") df.index = df["ts"] df.pop("ts") diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 48f51dfa981ca..e8ef0592ac432 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._config.config import option_context import pandas as pd @@ -113,7 +113,7 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") + @pytest.mark.xfail(using_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 91b5f905ada22..734bfc8b30053 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -251,9 +251,6 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't compare string and int" - ) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError @@ -1545,6 +1542,7 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( @@ -2099,6 +2097,7 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mixed_col_index_dtype(): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index 098d1829b973c..dc163268f64b9 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -2,6 +2,8 @@ import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -9,6 +11,7 @@ pa = pytest.importorskip("pyarrow") 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="14.0") def test_dataframe_arrow_interface(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) @@ -31,6 +34,7 @@ def test_dataframe_arrow_interface(): assert table.equals(expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="15.0") def test_dataframe_to_arrow(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 3f0e829f66361..c95c382bb5131 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -160,6 +162,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types @@ -191,6 +194,7 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64; as of 2.0 this does # *not* convert @@ -395,6 +399,7 @@ def test_update_inplace_sets_valid_block_values(): assert isinstance(df._mgr.blocks[0].values, Categorical) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_nonconsolidated_item_cache_take(): # https://github.com/pandas-dev/pandas/issues/35521 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e2f12e6e459cb..607e333d82823 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -14,14 +14,14 @@ ) import functools import re +import zoneinfo import numpy as np from numpy import ma from numpy.ma import mrecords import pytest -import pytz -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import lib from pandas.compat.numpy import np_version_gt2 @@ -101,7 +101,7 @@ def test_constructor_dict_with_tzaware_scalar(self): df = DataFrame({"dt": dt}, index=[0]) expected = DataFrame({"dt": [dt]}) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected, check_index_type=False) # Non-homogeneous df = DataFrame({"dt": dt, "value": [1]}) @@ -121,7 +121,7 @@ def test_construct_ndarray_with_nas_and_int_dtype(self): def test_construct_from_list_of_datetimes(self): df = DataFrame([datetime.now(), datetime.now()]) - assert df[0].dtype == np.dtype("M8[ns]") + assert df[0].dtype == np.dtype("M8[us]") def test_constructor_from_tzaware_datetimeindex(self): # don't cast a DatetimeIndex WITH a tz, leave as object @@ -180,24 +180,24 @@ def test_datetimelike_values_with_object_dtype(self, kind, frame_or_series): arr = arr[:, 0] obj = frame_or_series(arr, dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) # go through a different path in internals.construction obj = 
frame_or_series(frame_or_series(arr), dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) obj = frame_or_series(frame_or_series(arr), dtype=NumpyEADtype(object)) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) if frame_or_series is DataFrame: # other paths through internals.construction sers = [Series(x) for x in arr] obj = frame_or_series(sers, dtype=object) - assert obj._mgr.arrays[0].dtype == object - assert isinstance(obj._mgr.arrays[0].ravel()[0], scalar_type) + assert obj._mgr.blocks[0].values.dtype == object + assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type) def test_series_with_name_not_matching_column(self): # GH#9232 @@ -297,16 +297,16 @@ def test_constructor_dtype_nocast_view_dataframe(self): def test_constructor_dtype_nocast_view_2d_array(self): df = DataFrame([[1, 2], [3, 4]], dtype="int64") df2 = DataFrame(df.values, dtype=df[0].dtype) - assert df2._mgr.arrays[0].flags.c_contiguous + assert df2._mgr.blocks[0].values.flags.c_contiguous - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) assert np.shares_memory(df.values, arr) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") + @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") @@ -566,7 +566,7 @@ def test_constructor_invalid_items_unused(self, scalar): expected = DataFrame(columns=["b"]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) + @pytest.mark.parametrize("value", [4, np.nan, None, float("nan")]) def test_constructor_dict_nan_key(self, value): # GH 18455 cols = [1, value, 3] @@ -852,16 +852,18 @@ def create_data(constructor): expected = DataFrame( [ - {0: 0, 1: None, 2: None, 3: None}, - {0: None, 1: 2, 2: None, 3: None}, - {0: None, 1: None, 2: 4, 3: None}, - {0: None, 1: None, 2: None, 3: 6}, + [0, None, None, None], + [None, 2, None, None], + [None, None, 4, None], + [None, None, None, 6], ], index=[Timestamp(dt) for dt in dates_as_str], ) result_datetime64 = DataFrame(data_datetime64) result_datetime = DataFrame(data_datetime) + assert result_datetime.index.unit == "us" + result_datetime.index = result_datetime.index.as_unit("s") result_Timestamp = DataFrame(data_Timestamp) tm.assert_frame_equal(result_datetime64, expected) tm.assert_frame_equal(result_datetime, expected) @@ -931,7 +933,7 @@ def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype): ) def test_constructor_extension_scalar_data(self, data, dtype): # GH 34832 - df = DataFrame(index=[0, 1], columns=["a", "b"], data=data) + df = DataFrame(index=range(2), columns=["a", "b"], data=data) assert df["a"].dtype == dtype assert df["b"].dtype == dtype @@ -1267,7 +1269,7 @@ def test_constructor_list_of_lists(self, using_infer_string): # GH 
4851
         # list of 0-dim ndarrays
-        expected = DataFrame({0: np.arange(10)})
+        expected = DataFrame(np.arange(10))
         data = [np.array(x) for x in range(10)]
         result = DataFrame(data)
         tm.assert_frame_equal(result, expected)
 
@@ -1319,12 +1321,12 @@ def test_constructor_unequal_length_nested_list_column(self):
             [[Timestamp("2021-01-01")]],
             [{"x": Timestamp("2021-01-01")}],
             {"x": [Timestamp("2021-01-01")]},
-            {"x": Timestamp("2021-01-01").as_unit("ns")},
+            {"x": Timestamp("2021-01-01")},
         ],
     )
     def test_constructor_one_element_data_list(self, data):
         # GH#42810
-        result = DataFrame(data, index=[0, 1, 2], columns=["x"])
+        result = DataFrame(data, index=range(3), columns=["x"])
         expected = DataFrame({"x": [Timestamp("2021-01-01")] * 3})
         tm.assert_frame_equal(result, expected)
 
@@ -1631,7 +1633,7 @@ def test_constructor_Series_named(self):
         s = Series(arr, index=range(3, 13))
         df = DataFrame(s)
         expected = DataFrame({0: s})
-        tm.assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected, check_column_type=False)
 
         msg = r"Shape of passed values is \(10, 1\), indices imply \(10, 2\)"
         with pytest.raises(ValueError, match=msg):
@@ -1650,7 +1652,7 @@ def test_constructor_Series_named(self):
 
         # this is a bit non-intuitive here; the series collapse down to arrays
         df = DataFrame([arr, s1]).T
-        expected = DataFrame({1: s1, 0: arr}, columns=[0, 1])
+        expected = DataFrame({1: s1, 0: arr}, columns=range(2))
         tm.assert_frame_equal(df, expected)
 
     def test_constructor_Series_named_and_columns(self):
@@ -1887,7 +1889,7 @@ def test_constructor_with_datetimes1(self):
         ind = date_range(start="2000-01-01", freq="D", periods=10)
         datetimes = [ts.to_pydatetime() for ts in ind]
         datetime_s = Series(datetimes)
-        assert datetime_s.dtype == "M8[ns]"
+        assert datetime_s.dtype == "M8[us]"
 
     def test_constructor_with_datetimes2(self):
         # GH 2810
@@ -1898,7 +1900,7 @@ def test_constructor_with_datetimes2(self):
         df["dates"] = dates
         result = df.dtypes
         expected = Series(
-            [np.dtype("datetime64[ns]"), np.dtype("object")],
+            [np.dtype("datetime64[us]"), np.dtype("object")],
             index=["datetimes", "dates"],
         )
         tm.assert_series_equal(result, expected)
@@ -1906,8 +1908,7 @@ def test_constructor_with_datetimes3(self):
         # GH 7594
         # don't coerce tz-aware
-        tz = pytz.timezone("US/Eastern")
-        dt = tz.localize(datetime(2012, 1, 1))
+        dt = datetime(2012, 1, 1, tzinfo=zoneinfo.ZoneInfo("US/Eastern"))
 
         df = DataFrame({"End Date": dt}, index=[0])
         assert df.iat[0, 0] == dt
@@ -1918,7 +1919,7 @@ def test_constructor_with_datetimes3(self):
         df = DataFrame([{"End Date": dt}])
         assert df.iat[0, 0] == dt
         tm.assert_series_equal(
-            df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}, dtype=object)
+            df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object)
         )
 
     def test_constructor_with_datetimes4(self):
@@ -1934,6 +1935,7 @@ def test_constructor_with_datetimes4(self):
         df = DataFrame({"value": dr})
         assert str(df.iat[0, 0].tz) == "US/Eastern"
 
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_constructor_with_datetimes5(self):
         # GH 7822
         # preserve an index with a tz on dict construction
@@ -1971,7 +1973,14 @@ def test_constructor_with_datetimes6(self):
     def test_constructor_datetimes_with_nulls(self, arr):
         # gh-15869, GH#11220
         result = DataFrame(arr).dtypes
-        expected = Series([np.dtype("datetime64[ns]")])
+        unit = "ns"
+        if isinstance(arr, np.ndarray):
+            # inferred from a pydatetime object
+            unit = "us"
+        elif not any(isinstance(x, np.datetime64) for y in arr for x in y):
+            # TODO: it is not clear why this condition leads to different behavior
+            unit = "s"
+        expected = Series([np.dtype(f"datetime64[{unit}]")])
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("order", ["K", "A", "C", "F"])
@@ -2095,7 +2104,7 @@ def test_constructor_for_list_with_dtypes(self, using_infer_string):
                 np.dtype("int64"),
                 np.dtype("float64"),
                 np.dtype("object") if not using_infer_string else "string",
-                np.dtype("datetime64[ns]"),
+                np.dtype("datetime64[us]"),
                 np.dtype("float64"),
             ],
             index=list("abcde"),
@@ -2398,7 +2407,7 @@ class DatetimeSubclass(datetime):
             pass
 
         data = DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]})
-        assert data.datetime.dtype == "datetime64[ns]"
+        assert data.datetime.dtype == "datetime64[us]"
 
     def test_with_mismatched_index_length_raises(self):
         # GH#33437
@@ -2484,9 +2493,9 @@ def get_base(obj):
         def check_views(c_only: bool = False):
             # Check that the underlying data behind df["c"] is still `c`
             # after setting with iloc. Since we don't know which entry in
-            # df._mgr.arrays corresponds to df["c"], we just check that exactly
+            # df._mgr.blocks corresponds to df["c"], we just check that exactly
             # one of these arrays is `c`. GH#38939
-            assert sum(x is c for x in df._mgr.arrays) == 1
+            assert sum(x.values is c for x in df._mgr.blocks) == 1
             if c_only:
                 # If we ever stop consolidating in setitem_with_indexer,
                 # this will become unnecessary.
@@ -2494,17 +2503,17 @@ def check_views(c_only: bool = False):
 
             assert (
                 sum(
-                    get_base(x) is a
-                    for x in df._mgr.arrays
-                    if isinstance(x.dtype, np.dtype)
+                    get_base(x.values) is a
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
                 )
                 == 1
             )
             assert (
                 sum(
-                    get_base(x) is b
-                    for x in df._mgr.arrays
-                    if isinstance(x.dtype, np.dtype)
+                    get_base(x.values) is b
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
                 )
                 == 1
             )
@@ -2514,11 +2523,13 @@ def check_views(c_only: bool = False):
             check_views()
 
         # TODO: most of the rest of this test belongs in indexing tests
-        if lib.is_np_dtype(df.dtypes.iloc[0], "fciuO"):
-            warn = None
+        should_raise = not lib.is_np_dtype(df.dtypes.iloc[0], "fciuO")
+        if should_raise:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.iloc[0, 0] = 0
+                df.iloc[0, 1] = 0
+            return
         else:
-            warn = FutureWarning
-        with tm.assert_produces_warning(warn, match="incompatible dtype"):
             df.iloc[0, 0] = 0
             df.iloc[0, 1] = 0
         if not copy:
@@ -2693,21 +2704,14 @@ def test_frame_string_inference_block_dim(self):
         df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
         assert df._mgr.blocks[0].ndim == 2
 
-    def test_inference_on_pandas_objects(self):
+    @pytest.mark.parametrize("klass", [Series, Index])
+    def test_inference_on_pandas_objects(self, klass):
         # GH#56012
-        idx = Index([Timestamp("2019-12-31")], dtype=object)
-        with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
-            result = DataFrame(idx, columns=["a"])
-        assert result.dtypes.iloc[0] != np.object_
-        result = DataFrame({"a": idx})
+        obj = klass([Timestamp("2019-12-31")], dtype=object)
+        result = DataFrame(obj, columns=["a"])
         assert result.dtypes.iloc[0] == np.object_
 
-        ser = Series([Timestamp("2019-12-31")], dtype=object)
-
-        with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
-            result = DataFrame(ser, columns=["a"])
-        assert result.dtypes.iloc[0] != np.object_
-        result = DataFrame({"a": ser})
+        result = DataFrame({"a": obj})
         assert result.dtypes.iloc[0] == np.object_
 
     def test_dict_keys_returns_rangeindex(self):
@@ -2715,6 +2719,21 @@ def test_dict_keys_returns_rangeindex(self):
         expected = RangeIndex(2)
         tm.assert_index_equal(result, expected, exact=True)
 
+    @pytest.mark.parametrize(
+        "cons", [Series, Index, DatetimeIndex, DataFrame, pd.array, pd.to_datetime]
+    )
+    def test_construction_datetime_resolution_inference(self, cons):
+        ts = Timestamp(2999, 1, 1)
+        ts2 = ts.tz_localize("US/Pacific")
+
+        obj = cons([ts])
+        res_dtype = tm.get_dtype(obj)
+        assert res_dtype == "M8[us]", res_dtype
+
+        obj2 = cons([ts2])
+        res_dtype2 = tm.get_dtype(obj2)
+        assert res_dtype2 == "M8[us, US/Pacific]", res_dtype2
+
 
 class TestDataFrameConstructorIndexInference:
     def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
@@ -2846,8 +2865,8 @@ def test_construction_preserves_tzaware_dtypes(self, tz):
             [
                 np.dtype("datetime64[ns]"),
                 DatetimeTZDtype(tz=tz),
-                np.dtype("datetime64[ns]"),
-                DatetimeTZDtype(tz=tz),
+                np.dtype("datetime64[us]"),
+                DatetimeTZDtype(tz=tz, unit="us"),
             ],
             index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"],
        )
@@ -2944,7 +2963,8 @@ def test_frame_timeseries_column(self):
                     Timestamp("20130101T10:01:00", tz="US/Eastern"),
                     Timestamp("20130101T10:02:00", tz="US/Eastern"),
                 ]
-            }
+            },
+            dtype="M8[ns, US/Eastern]",
         )
         tm.assert_frame_equal(result, expected)
@@ -2997,9 +3017,9 @@ def test_from_tzaware_mixed_object_array(self):
         res = DataFrame(arr, columns=["A", "B", "C"])
 
         expected_dtypes = [
-            "datetime64[ns]",
-            "datetime64[ns, US/Eastern]",
-            "datetime64[ns, CET]",
+            "datetime64[s]",
+            "datetime64[s, US/Eastern]",
+            "datetime64[s, CET]",
         ]
         assert (res.dtypes == expected_dtypes).all()
@@ -3027,7 +3047,7 @@ def test_construction_from_ndarray_datetimelike(self):
         # constructed from 2D ndarray
        arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3)
         df = DataFrame(arr)
-        assert all(isinstance(arr, DatetimeArray) for arr in df._mgr.arrays)
+        assert all(isinstance(block.values, DatetimeArray) for block in df._mgr.blocks)
 
     def test_construction_from_ndarray_with_eadtype_mismatched_columns(self):
         arr = np.random.default_rng(2).standard_normal((10, 2))
@@ -3153,14 +3173,6 @@ def test_from_out_of_bounds_ns_datetime(
         self, constructor, cls, request, box, frame_or_series
     ):
         # scalar that won't fit in nanosecond dt64, but will fit in microsecond
-        if box is list or (frame_or_series is Series and box is dict):
-            mark = pytest.mark.xfail(
-                reason="Timestamp constructor has been updated to cast dt64 to "
-                "non-nano, but DatetimeArray._from_sequence has not",
-                strict=True,
-            )
-            request.applymarker(mark)
-
         scalar = datetime(9999, 1, 1)
         exp_dtype = "M8[us]"  # pydatetime objects default to this reso
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index 643d342b052a4..aa2fb19fe8528 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas.errors import (
     NumExprClobberingError,
     UndefinedVariableError,
@@ -202,6 +204,25 @@ def test_eval_simple(self, engine, parser):
         expected = df["a"]
         tm.assert_series_equal(expected, res)
 
+    def test_extension_array_eval(self, engine, parser, request):
+        # GH#58748
+        if engine == "numexpr":
+            mark = pytest.mark.xfail(
+                reason="numexpr does not support extension array dtypes"
+            )
+            request.applymarker(mark)
+        df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])})
+        result = df.eval("a / b", engine=engine, parser=parser)
+        expected = Series(pd.array([0.25, 0.40, 0.50]))
+
tm.assert_series_equal(result, expected) + + def test_complex_eval(self, engine, parser): + # GH#21374 + df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]}) + result = df.eval("a/b", engine=engine, parser=parser) + expected = Series([1.5 + 0.5j]) + tm.assert_series_equal(result, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): @@ -740,11 +761,12 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture df_index = date_range( - start="2019-01-01", freq="1d", periods=10, tz=tz, name="time" + start="2019-01-01", freq="1D", periods=10, tz=tz, name="time" ) expected = DataFrame(index=df_index) df = DataFrame(index=df_index) @@ -1158,6 +1180,7 @@ def test_query_string_null_elements(self, in_list): df_expected = DataFrame({"a": expected}, dtype="string") df_expected.index = df_expected.index.astype("int64") df = DataFrame({"a": in_list}, dtype="string") + df.index = Index(list(df.index), dtype=df.index.dtype) res1 = df.query("a == 'asdf'", parser=parser, engine=engine) res2 = df[df["a"] == "asdf"] res3 = df.query("a <= 'asdf'", parser=parser, engine=engine) @@ -1400,12 +1423,12 @@ def test_query_ea_dtypes(self, dtype): if dtype == "int64[pyarrow]": pytest.importorskip("pyarrow") # GH#50261 - df = DataFrame({"a": Series([1, 2], dtype=dtype)}) + df = DataFrame({"a": [1, 2]}, dtype=dtype) ref = {2} # noqa: F841 warning = RuntimeWarning if dtype == "Int64" and NUMEXPR_INSTALLED else None with tm.assert_produces_warning(warning): result = df.query("a in @ref") - expected = DataFrame({"a": Series([2], dtype=dtype, index=[1])}) + expected = DataFrame({"a": [2]}, index=range(1, 2), dtype=dtype) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("engine", ["python", "numexpr"]) @@ -1424,8 +1447,8 @@ def test_query_ea_equality_comparison(self, dtype, engine): result = df.query("A == B", engine=engine) expected = DataFrame( { - "A": Series([1, 2], dtype="Int64", index=[0, 2]), - "B": Series([1, 2], dtype=dtype, index=[0, 2]), + "A": Series([1, 2], dtype="Int64", index=range(0, 4, 2)), + "B": Series([1, 2], dtype=dtype, index=range(0, 4, 2)), } ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 5118561f67338..4c355ed92b6c3 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import ( IS64, @@ -465,7 +465,7 @@ def test_mixed_ops(self, op): getattr(df, op)() @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + using_string_dtype(), reason="sum doesn't work for arrow strings" ) def test_reduce_mixed_frame(self): # GH 6806 @@ -490,10 +490,8 @@ def test_nunique(self): tm.assert_series_equal( df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) ) - tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) - tm.assert_series_equal( - df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) - ) + tm.assert_series_equal(df.nunique(axis=1), Series([1, 2, 2])) + tm.assert_series_equal(df.nunique(axis=1, 
dropna=False), Series([1, 3, 2])) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_mixed_datetime_numeric(self, tz): @@ -608,6 +606,7 @@ def test_sem(self, datetime_frame): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, expected", [ @@ -707,8 +706,8 @@ def test_mode_sortwarning(self, using_infer_string): def test_mode_empty_df(self): df = DataFrame([], columns=["a", "b"]) + expected = df.copy() result = df.mode() - expected = DataFrame([], columns=["a", "b"], index=Index([], dtype=np.int64)) tm.assert_frame_equal(result, expected) def test_operators_timedelta64(self): @@ -769,7 +768,7 @@ def test_operators_timedelta64(self): # excludes non-numeric result = mixed.min(axis=1, numeric_only=True) - expected = Series([1, 1, 1.0], index=[0, 1, 2]) + expected = Series([1, 1, 1.0]) tm.assert_series_equal(result, expected) # works when only those columns are selected @@ -1059,6 +1058,7 @@ def test_sum_bools(self): # ---------------------------------------------------------------------- # Index of max / min + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1109,6 +1109,7 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1186,21 +1187,21 @@ def test_idxmax_mixed_dtype(self): df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti}) result = df.idxmax() - expected = Series([1, 0, 2], index=[1, 2, 3]) + expected = Series([1, 0, 2], index=range(1, 4)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 0], index=[1, 2, 3]) + expected = Series([0, 2, 0], index=range(1, 4)) tm.assert_series_equal(result, expected) # with NaTs df.loc[0, 3] = pd.NaT result = df.idxmax() - expected = Series([1, 0, 2], index=[1, 2, 3]) + expected = Series([1, 0, 2], index=range(1, 4)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 1], index=[1, 2, 3]) + expected = Series([0, 2, 1], index=range(1, 4)) tm.assert_series_equal(result, expected) # with multi-column dt64 block @@ -1208,11 +1209,11 @@ def test_idxmax_mixed_dtype(self): df._consolidate_inplace() result = df.idxmax() - expected = Series([1, 0, 2, 0], index=[1, 2, 3, 4]) + expected = Series([1, 0, 2, 0], index=range(1, 5)) tm.assert_series_equal(result, expected) result = df.idxmin() - expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4]) + expected = Series([0, 2, 1, 2], index=range(1, 5)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1348,6 +1349,7 @@ def test_any_all_extra(self): result = df[["C"]].all(axis=None).item() assert result is True + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) def test_any_all_object_dtype( self, axis, all_boolean_reductions, skipna, using_infer_string @@ -1829,7 +1831,7 @@ def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=0) - expected = Series([exp_value, exp_value], dtype=exp_dtype) + 
expected = Series([exp_value, exp_value], dtype=exp_dtype, index=range(2)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1852,7 +1854,7 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=1) - expected = Series([np.nan, np.nan], dtype=exp_dtype) + expected = Series([np.nan, np.nan], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1875,7 +1877,7 @@ def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=0) - expected = Series([exp_value, exp_value], dtype=exp_dtype) + expected = Series([exp_value, exp_value], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) # TODO: why does min_count=1 impact the resulting Windows dtype @@ -1900,7 +1902,7 @@ def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): df = DataFrame({0: [], 1: []}, dtype=dtype) result = getattr(df, opname)(min_count=1) - expected = Series([pd.NA, pd.NA], dtype=exp_dtype) + expected = Series([pd.NA, pd.NA], dtype=exp_dtype, index=Index([0, 1])) tm.assert_series_equal(result, expected) @@ -1932,9 +1934,7 @@ def test_sum_timedelta64_skipna_false(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" -) +@pytest.mark.xfail(using_string_dtype(), reason="sum doesn't work with arrow strings") def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index f6e0251d52de1..10cc86385af1b 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( NA, @@ -38,10 +38,10 @@ def test_repr_should_return_str(self): index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"] cols = ["\u03c8"] df = DataFrame(data, columns=cols, index=index1) - assert type(df.__repr__()) is str # noqa: E721 + assert type(df.__repr__()) is str ser = df[cols[0]] - assert type(ser.__repr__()) is str # noqa: E721 + assert type(ser.__repr__()) is str def test_repr_bytes_61_lines(self): # GH#12857 @@ -176,7 +176,7 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") + @pytest.mark.xfail(using_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index a3a1da6e57cb0..92bcd6f0c7d0c 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib import pandas as pd @@ -714,13 +716,13 @@ def test_unstack_unused_levels(self): df = DataFrame([[1, 0]] * 3, index=idx) result = df.unstack() - exp_col = MultiIndex.from_product([[0, 1], ["A", "B", "C"]]) + exp_col = MultiIndex.from_product([range(2), ["A", "B", "C"]]) expected = DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col) tm.assert_frame_equal(result, expected) assert 
(result.columns.levels[1] == idx.levels[1]).all() # Unused items on both levels - levels = [[0, 1, 7], [0, 1, 2, 3]] + levels = [range(3), range(4)] codes = [[0, 0, 1, 1], [0, 2, 0, 2]] idx = MultiIndex(levels, codes) block = np.arange(4).reshape(2, 2) @@ -752,7 +754,7 @@ def test_unstack_unused_levels_mixed_with_nan( result = df.unstack(level=level) exp_data = np.zeros(18) * np.nan exp_data[idces] = data - cols = MultiIndex.from_product([[0, 1], col_level]) + cols = MultiIndex.from_product([range(2), col_level]) expected = DataFrame(exp_data.reshape(3, 6), index=idx_level, columns=cols) tm.assert_frame_equal(result, expected) @@ -1067,7 +1069,7 @@ def test_stack_datetime_column_multiIndex(self, future_stack): with tm.assert_produces_warning(warn, match=msg): result = df.stack(future_stack=future_stack) - eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) + eidx = MultiIndex.from_product([range(4), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) tm.assert_frame_equal(result, expected) @@ -1150,7 +1152,7 @@ def test_stack_full_multiIndex(self, future_stack): expected = DataFrame( [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], index=MultiIndex( - levels=[[0, 1], ["u", "x", "y", "z"]], + levels=[range(2), ["u", "x", "y", "z"]], codes=[[0, 0, 1, 1], [1, 3, 1, 3]], names=[None, "Lower"], ), @@ -1201,7 +1203,7 @@ def test_stack_multi_preserve_categorical_dtype( s_cidx = pd.CategoricalIndex(labels, ordered=ordered) expected_data = sorted(data) if future_stack else data expected = Series( - expected_data, index=MultiIndex.from_product([[0], s_cidx, cidx2]) + expected_data, index=MultiIndex.from_product([range(1), s_cidx, cidx2]) ) tm.assert_series_equal(result, expected) @@ -1214,7 +1216,7 @@ def test_stack_preserve_categorical_dtype_values(self, future_stack): cat = pd.Categorical(["a", "a", "b", "c"]) df = DataFrame({"A": cat, "B": cat}) result = df.stack(future_stack=future_stack) - index = MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) + index = MultiIndex.from_product([range(4), ["A", "B"]]) expected = Series( pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index ) @@ -1298,7 +1300,7 @@ def test_unstack_mixed_extension_types(self, level): @pytest.mark.parametrize("level", [0, "baz"]) def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 - mi = MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"]) + mi = MultiIndex.from_product([range(1), ["d", "c"]], names=["bar", "baz"]) df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"]) df.columns.name = "foo" @@ -1339,7 +1341,9 @@ def test_unstack_sort_false(frame_or_series, dtype): result = obj.unstack(level=-1, sort=False) if frame_or_series is DataFrame: - expected_columns = MultiIndex.from_tuples([(0, "b"), (0, "a")]) + expected_columns = MultiIndex( + levels=[range(1), ["b", "a"]], codes=[[0, 0], [0, 1]] + ) else: expected_columns = ["b", "a"] expected = DataFrame( @@ -1355,7 +1359,9 @@ def test_unstack_sort_false(frame_or_series, dtype): result = obj.unstack(level=[1, 2], sort=False) if frame_or_series is DataFrame: - expected_columns = MultiIndex.from_tuples([(0, "z", "b"), (0, "y", "a")]) + expected_columns = MultiIndex( + levels=[range(1), ["z", "y"], ["b", "a"]], codes=[[0, 0], [0, 1], [0, 1]] + ) else: expected_columns = MultiIndex.from_tuples([("z", "b"), ("y", "a")]) expected = DataFrame( @@ -1432,7 +1438,7 @@ def test_stack_timezone_aware_values(future_stack): @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) 
def test_stack_empty_frame(dropna, future_stack): # GH 36113 - levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] + levels = [pd.RangeIndex(0), pd.RangeIndex(0)] expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) if future_stack and dropna is not lib.no_default: with pytest.raises(ValueError, match="dropna must be unspecified"): @@ -1510,7 +1516,9 @@ def test_stack_positional_level_duplicate_column_names(future_stack): result = df.stack(0, future_stack=future_stack) new_columns = Index(["y", "z"], name="a") - new_index = MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"]) + new_index = MultiIndex( + levels=[range(1), ["x", "y"]], codes=[[0, 0], [0, 1]], names=[None, "a"] + ) expected = DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns) tm.assert_frame_equal(result, expected) @@ -1663,6 +1671,7 @@ def test_unstack_multiple_no_empty_columns(self): expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -1913,6 +1922,7 @@ def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) @@ -2318,7 +2328,7 @@ def test_stack_unstack_unordered_multiindex(self, future_stack): ) expected = DataFrame( [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]], - index=[0, 1, 2, 3, 4], + index=range(5), columns=MultiIndex.from_tuples( [("a", "x"), ("b", "x")], names=["first", "second"] ), @@ -2520,7 +2530,7 @@ def test_multi_level_stack_categorical(self, future_stack): ] ), ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" @@ -2657,7 +2667,7 @@ def test_stack_tuple_columns(future_stack): expected = Series( [1, 2, 3, 4, 5, 6, 7, 8, 9], index=MultiIndex( - levels=[[0, 1, 2], [("a", 1), ("a", 2), ("b", 1)]], + levels=[range(3), [("a", 1), ("a", 2), ("b", 1)]], codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ), ) diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index e89175ceff0c1..1887fa61ad081 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -128,6 +130,7 @@ def test_pos_object(self, df_data): tm.assert_frame_equal(+df, df) tm.assert_series_equal(+df["a"], df["a"]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning") def test_pos_object_raises(self): # GH#21380 diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 0b607d91baf65..b591b1b1092d4 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -93,8 +93,7 @@ def test_get_numeric_data(self, frame_or_series): if isinstance(o, DataFrame): # preserve columns dtype expected.columns = o.columns[:0] - # 
https://github.com/pandas-dev/pandas/issues/50862 - tm.assert_equal(result.reset_index(drop=True), expected) + tm.assert_equal(result, expected) # get the bool data arr = np.array([True, True, False, True]) @@ -102,6 +101,11 @@ def test_get_numeric_data(self, frame_or_series): result = o._get_numeric_data() tm.assert_equal(result, o) + def test_get_bool_data_empty_preserve_index(self): + expected = Series([], dtype="bool") + result = expected._get_bool_data() + tm.assert_series_equal(result, expected, check_index_type=True) + def test_nonzero(self, frame_or_series): # GH 4633 # look at the boolean/nonzero behavior for objects diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index 491f621783a76..d8401a8b2ae3f 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -9,7 +9,6 @@ date_range, ) import pandas._testing as tm -from pandas.util.version import Version pytest.importorskip("xarray") @@ -30,17 +29,11 @@ def df(self): } ) - def test_to_xarray_index_types(self, index_flat, df, using_infer_string, request): + def test_to_xarray_index_types(self, index_flat, df, using_infer_string): index = index_flat # MultiIndex is tested in test_to_xarray_with_multiindex if len(index) == 0: pytest.skip("Test doesn't make sense for empty index") - import xarray - - if Version(xarray.__version__) >= Version("2024.5"): - request.applymarker( - pytest.mark.xfail(reason="https://github.com/pydata/xarray/issues/9026") - ) from xarray import Dataset diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3362d6209af6d..46c27849356b5 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -62,6 +64,32 @@ def test_agg_ser_multi_key(df): tm.assert_series_equal(results, expected) +def test_agg_with_missing_values(): + # GH#58810 + missing_df = DataFrame( + { + "nan": [np.nan, np.nan, np.nan, np.nan], + "na": [pd.NA, pd.NA, pd.NA, pd.NA], + "nat": [pd.NaT, pd.NaT, pd.NaT, pd.NaT], + "none": [None, None, None, None], + "values": [1, 2, 3, 4], + } + ) + + result = missing_df.agg(x=("nan", "min"), y=("na", "min"), z=("values", "sum")) + + expected = DataFrame( + { + "nan": [np.nan, np.nan, np.nan], + "na": [np.nan, np.nan, np.nan], + "values": [np.nan, np.nan, 10.0], + }, + index=["x", "y", "z"], + ) + + tm.assert_frame_equal(result, expected) + + def test_groupby_aggregation_mixed_dtype(): # GH 6212 expected = DataFrame( @@ -268,6 +296,7 @@ def aggfun_1(ser): assert len(result) == 0 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) @@ -1088,6 +1117,7 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( @@ -1662,3 +1692,121 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +def test_groupby_aggregation_duplicate_columns_single_dict_value(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + 
result = gb.agg({"c": "sum"}) + + expected = DataFrame( + [[7, 9], [5, 6]], columns=["c", "c"], index=Index([1, 2], name="a") + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_multiple_dict_values(): + # GH#55041 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"c": ["sum", "min", "max", "min"]}) + + expected = DataFrame( + [[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]], + columns=MultiIndex( + levels=[["c"], ["sum", "min", "max"]], + codes=[[0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 1, 0, 1, 2, 1]], + ), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_duplicate_columns_some_empty_result(): + # GH#55041 + df = DataFrame( + [ + [1, 9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, -546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=["a", "b", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": [], "c": ["var"]}) + + expected = DataFrame( + [[1.509268e11, 30944844.5], [2.178000e03, 0.0]], + columns=MultiIndex(levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]), + index=Index([1, 2], name="a"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): "min"}) + + expected = DataFrame( + [[-9843, 9], [244, -33]], + columns=MultiIndex(levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_func_list_multi_index_duplicate_columns(): + # GH#55041 + df = DataFrame( + [ + [1, -9843, 43, 54, 7867], + [2, 940, 9, -34, 44], + [1, -34, 546, -549358, 0], + [2, 244, -33, -100, 44], + ], + columns=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]], + ), + index=MultiIndex( + levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]], + codes=[[0, 0, 0, 1], [0, 1, 1, 0]], + ), + ) + gb = df.groupby(level=0) + result = gb.agg({("level1.1", "level2.2"): ["min", "max"]}) + + expected = DataFrame( + [[-9843, 940, 9, 546], [244, 244, -33, -33]], + columns=MultiIndex( + levels=[["level1.1"], ["level2.2"], ["min", "max"]], + codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]], + ), + index=Index(["level1.1", "level1.2"]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index aafd06e8f88cf..4a4f5882b7e85 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -90,6 +92,7 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def 
test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} @@ -143,6 +146,7 @@ def test_cython_agg_return_dict(): tm.assert_series_equal(ts, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) @@ -285,7 +289,7 @@ def test_read_only_buffer_source_agg(agg): "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], } ) - df._mgr.arrays[0].flags.writeable = False + df._mgr.blocks[0].values.flags.writeable = False result = df.groupby(["species"]).agg({"sepal_length": agg}) expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 78f2917e9a057..835cad0d13078 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError import pandas as pd @@ -306,6 +308,7 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index 0f5fc915f9523..5f1f85d8179cd 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -71,6 +73,7 @@ def test_series_describe_as_index(as_index, keys): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() @@ -246,6 +249,7 @@ def test_describe_non_cython_paths(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [int, float, object]) @pytest.mark.parametrize( "kwargs", diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index 1b852abad6c8e..d20b30834dea2 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -677,6 +679,7 @@ def test_first_multi_key_groupby_categorical(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index af0deba138469..0e31c0698cb1e 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -156,6 +158,7 @@ def 
test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) @@ -238,6 +241,7 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 5a3eb49e97fb7..edeac642551a0 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -76,6 +78,7 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 0f136b06c782a..14d3dbd6fa496 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -273,6 +275,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("groupby", ["column", "array", "function"]) @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize( @@ -356,6 +359,7 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index ad35bec70f668..945c3e421a132 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -25,9 +25,12 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): gb = df.groupby(["a", "b", "c"], group_keys=False) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) - - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = method(*args).index expected = df.index @@ -42,18 +45,12 @@ def test_duplicate_columns(request, groupby_func, as_index): df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) args = get_groupby_method_args(groupby_func, df) gb = df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna 
is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = getattr(gb, groupby_func)(*args) + result = getattr(gb, groupby_func)(*args) expected_df = df.set_axis(["a", "b", "c"], axis=1) expected_args = get_groupby_method_args(groupby_func, expected_df) expected_gb = expected_df.groupby("a", as_index=as_index) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = getattr(expected_gb, groupby_func)(*expected_args) + expected = getattr(expected_gb, groupby_func)(*expected_args) if groupby_func not in ("size", "ngroup", "cumcount"): expected = expected.rename(columns={"c": "b"}) tm.assert_equal(result, expected) @@ -74,8 +71,12 @@ def test_dup_labels_output_shape(groupby_func, idx): grp_by = df.groupby([0]) args = get_groupby_method_args(groupby_func, df) - warn = FutureWarning if groupby_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" with tm.assert_produces_warning(warn, match=warn_msg): result = getattr(grp_by, groupby_func)(*args) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 6f30dcfaaba7e..644f93a37a3a3 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -799,7 +801,7 @@ def func_with_date(batch): with tm.assert_produces_warning(DeprecationWarning, match=msg): dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( - {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] + {"b": pd.Timestamp(2015, 1, 1), "c": 2}, index=[1] ) dfg_conversion_expected.index.name = "a" @@ -920,6 +922,7 @@ def test_func_returns_object(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], @@ -1019,7 +1022,7 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = "string" if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], + [np.dtype("datetime64[us]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @@ -1197,7 +1200,14 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) - _ = getattr(grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + _ = getattr(grp, reduction_func)(*args) result = grp.apply(np.sum, axis=0, include_groups=False) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 2194e5692aa0e..c35f5d2bc26e8 100644 --- a/pandas/tests/groupby/test_categorical.py +++ 
b/pandas/tests/groupby/test_categorical.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -320,6 +322,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper @@ -1473,7 +1476,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun df_grp = df.groupby(["cat_1", "cat_2"], observed=True) args = get_groupby_method_args(reduction_func, df) - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) for cat in unobserved_cats: assert cat not in res.index @@ -1512,7 +1522,14 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( getattr(df_grp, reduction_func)(*args) return - res = getattr(df_grp, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1904,8 +1921,14 @@ def test_category_order_reducer( ): getattr(gb, reduction_func)(*args) return - - op_result = getattr(gb, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + warn_msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + warn_msg = "" + with tm.assert_produces_warning(warn, match=warn_msg): + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories else: diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 04883b3ef6b78..4fe3aac629513 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -248,7 +248,7 @@ def test_filter_using_len(): actual = grouped.filter(lambda x: len(x) > 2) expected = DataFrame( {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)}, - index=np.arange(2, 6, dtype=np.int64), + index=range(2, 6), ) tm.assert_frame_equal(actual, expected) @@ -262,7 +262,7 @@ def test_filter_using_len_series(): s = Series(list("aabbbbcc"), name="B") grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B") + expected = Series(4 * ["b"], index=range(2, 6), name="B") tm.assert_series_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b99ef2a0e840d..5ac6dc990c092 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError import pandas.util._test_decorators as td @@ -74,7 +76,7 @@ def max_value(group): tm.assert_series_equal(result, expected) -def test_pass_args_kwargs(ts, tsframe): +def test_pass_args_kwargs(ts): def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) @@ 
-100,28 +102,31 @@ def f(x, q=None, axis=0): tm.assert_series_equal(apply_result, agg_expected) tm.assert_series_equal(trans_result, trans_expected) - # DataFrame - for as_index in [True, False]: - df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) - agg_result = df_grouped.agg(np.percentile, 80, axis=0) - apply_result = df_grouped.apply(DataFrame.quantile, 0.8) - expected = df_grouped.quantile(0.8) - tm.assert_frame_equal(apply_result, expected, check_names=False) - tm.assert_frame_equal(agg_result, expected) - - apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8]) - expected_seq = df_grouped.quantile([0.4, 0.8]) - if not as_index: - # apply treats the op as a transform; .quantile knows it's a reduction - apply_result.index = range(4) - apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2]) - apply_result.insert(loc=1, column="level_1", value=[0.4, 0.8, 0.4, 0.8]) - tm.assert_frame_equal(apply_result, expected_seq, check_names=False) - - agg_result = df_grouped.agg(f, q=80) - apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) - tm.assert_frame_equal(agg_result, expected) - tm.assert_frame_equal(apply_result, expected, check_names=False) + +def test_pass_args_kwargs_dataframe(tsframe, as_index): + def f(x, q=None, axis=0): + return np.percentile(x, q, axis=axis) + + df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) + agg_result = df_grouped.agg(np.percentile, 80, axis=0) + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + expected = df_grouped.quantile(0.8) + tm.assert_frame_equal(apply_result, expected, check_names=False) + tm.assert_frame_equal(agg_result, expected) + + apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8]) + expected_seq = df_grouped.quantile([0.4, 0.8]) + if not as_index: + # apply treats the op as a transform; .quantile knows it's a reduction + apply_result.index = range(4) + apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2]) + apply_result.insert(loc=1, column="level_1", value=[0.4, 0.8, 0.4, 0.8]) + tm.assert_frame_equal(apply_result, expected_seq, check_names=False) + + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) + tm.assert_frame_equal(agg_result, expected) + tm.assert_frame_equal(apply_result, expected, check_names=False) def test_len(): @@ -148,8 +153,8 @@ def test_len_nan_group(): def test_groupby_timedelta_median(): # issue 57926 - expected = Series(data=Timedelta("1d"), index=["foo"]) - df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1d")]}) + expected = Series(data=Timedelta("1D"), index=["foo"]) + df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1D")]}) gb = df.groupby("label")["timedelta"] actual = gb.median() tm.assert_series_equal(actual, expected, check_names=False) @@ -828,7 +833,7 @@ def test_groupby_level_mapper(multiindex_dataframe_random_data): def test_groupby_level_nonmulti(): # GH 1313, GH 13901 s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo")) - expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo")) + expected = Series([11, 22, 3, 4, 5, 6], Index(list(range(1, 7)), name="foo")) result = s.groupby(level=0).sum() tm.assert_series_equal(result, expected) @@ -860,7 +865,7 @@ def test_groupby_level_nonmulti(): def test_groupby_complex(): # GH 12902 a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1]) - expected = Series((1 + 2j, 5 + 10j)) + expected = Series((1 + 2j, 5 + 10j), index=Index([0, 
1])) result = a.groupby(level=0).sum() tm.assert_series_equal(result, expected) @@ -1205,7 +1210,10 @@ def test_groupby_nat_exclude(): ) grouped = df.groupby("dt") - expected = [Index([1, 7]), Index([3, 5])] + expected = [ + RangeIndex(start=1, stop=13, step=6), + RangeIndex(start=3, stop=7, step=2), + ] keys = sorted(grouped.groups.keys()) assert len(keys) == 2 for k, e in zip(keys, expected): @@ -1235,7 +1243,7 @@ def test_groupby_nat_exclude(): {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} ) assert nan_df["nan"].dtype == "float64" - assert nan_df["nat"].dtype == "datetime64[ns]" + assert nan_df["nat"].dtype == "datetime64[s]" for key in ["nan", "nat"]: grouped = nan_df.groupby(key) @@ -1255,6 +1263,7 @@ def test_groupby_two_group_keys_all_nan(): assert result == {} +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] @@ -1955,9 +1964,9 @@ def test_groups_sort_dropna(sort, dropna): df = DataFrame([[2.0, 1.0], [np.nan, 4.0], [0.0, 3.0]]) keys = [(2.0, 1.0), (np.nan, 4.0), (0.0, 3.0)] values = [ - Index([0], dtype="int64"), - Index([1], dtype="int64"), - Index([2], dtype="int64"), + RangeIndex(0, 1), + RangeIndex(1, 2), + RangeIndex(2, 3), ] if sort: taker = [2, 0] if dropna else [2, 0, 1] @@ -2319,6 +2328,7 @@ def test_groupby_all_nan_groups_drop(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_empty_multi_column(as_index, numeric_only): # GH 15106 & GH 41998 @@ -2335,6 +2345,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_aggregation_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -2445,7 +2456,7 @@ def test_rolling_wrong_param_min_period(): test_df.columns = ["name", "val"] result_error_msg = ( - r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'$" + r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'" ) with pytest.raises(TypeError, match=result_error_msg): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() @@ -2492,6 +2503,7 @@ def test_groupby_none_in_first_mi_level(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_none_column_name(): # GH#47348 df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}) @@ -2665,7 +2677,9 @@ def test_groupby_method_drop_na(method): Series(["a", "b", "c"], name="A") ) else: - expected = DataFrame({"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4]) + expected = DataFrame( + {"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=range(0, 6, 2) + ) tm.assert_frame_equal(result, expected) @@ -2985,3 +2999,14 @@ def test_groupby_agg_namedagg_with_duplicate_columns(): ) tm.assert_frame_equal(result, expected) + + +def test_groupby_multi_index_codes(): + # GH#54347 + df = DataFrame( + {"A": [1, 2, 3, 4], "B": [1, float("nan"), 2, float("nan")], "C": [2, 4, 6, 8]} + ) + df_grouped = df.groupby(["A", "B"], dropna=False).sum() + + index = df_grouped.index + tm.assert_index_equal(index, MultiIndex.from_frame(index.to_frame())) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d3b3c945e06de..d42aa06d6bbfe 100644 --- 
a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -97,6 +99,7 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( tm.assert_frame_equal(grouped, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, idx, outputs", [ @@ -420,7 +423,7 @@ def test_groupby_drop_nan_with_multi_index(): ), ), "datetime64[ns]", - "period[d]", + "period[D]", "Sparse[float]", ], ) @@ -437,7 +440,7 @@ def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index): # Unique values to use for grouper, depends on dtype if dtype in ("string", "string[pyarrow]"): uniques = {"x": "x", "y": "y", "z": pd.NA} - elif dtype in ("datetime64[ns]", "period[d]"): + elif dtype in ("datetime64[ns]", "period[D]"): uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA} else: uniques = {"x": 1, "y": 2, "z": np.nan} @@ -543,7 +546,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki return gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) - expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].cat.remove_categories([4]) if index_kind == "multi": expected["x2"] = expected["x2"].cat.remove_categories([4]) @@ -567,7 +577,14 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki if as_index: expected = expected["size"].rename(None) - result = getattr(gb_keepna, reduction_func)(*args) + if reduction_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb_keepna, reduction_func)(*args) # size will return a Series, others are DataFrame tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 39eadd32f300d..fc2a8a970010a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -10,6 +10,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import SpecificationError import pandas as pd @@ -545,31 +547,38 @@ def test_multiindex_columns_empty_level(self): df = DataFrame([[1, "A"]], columns=midx) + msg = "`groups` by one element list returns scalar is deprecated" grouped = df.groupby("to filter").groups assert grouped["A"] == [0] - grouped = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = df.groupby([("to filter", "")]).groups assert grouped["A"] == [0] df = DataFrame([[1, "A"], [2, "B"]], columns=midx) expected = df.groupby("to filter").groups - result = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([("to filter", "")]).groups assert result == expected df = DataFrame([[1, "A"], [2, "A"]], columns=midx) expected = df.groupby("to 
filter").groups - result = df.groupby([("to filter", "")]).groups + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([("to filter", "")]).groups tm.assert_dict_equal(result, expected) def test_groupby_multiindex_tuple(self): - # GH 17979 + # GH 17979, GH#59179 df = DataFrame( [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), ) - expected = df.groupby([("b", 1)]).groups + + msg = "`groups` by one element list returns scalar is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby([("b", 1)]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) @@ -579,17 +588,21 @@ def test_groupby_multiindex_tuple(self): [["a", "b", "b", "c"], ["d", "d", "e", "e"]] ), ) - expected = df2.groupby([("b", "d")]).groups + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df2.groupby([("b", "d")]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) df3 = DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"]) - expected = df3.groupby([("b", "d")]).groups + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df3.groupby([("b", "d")]).groups result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) def test_groupby_multiindex_partial_indexing_equivalence(self): - # GH 17977 + # GH 17977, GH#59179 df = DataFrame( [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), @@ -615,8 +628,10 @@ def test_groupby_multiindex_partial_indexing_equivalence(self): result_max = df.groupby([("a", 1)])["b"].max() tm.assert_frame_equal(expected_max, result_max) - expected_groups = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].groups - result_groups = df.groupby([("a", 1)])["b"].groups + msg = "`groups` by one element list returns scalar is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected_groups = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].groups + result_groups = df.groupby([("a", 1)])["b"].groups tm.assert_dict_equal(expected_groups, result_groups) def test_groupby_level(self, sort, multiindex_dataframe_random_data, df): @@ -719,15 +734,18 @@ def test_grouping_labels(self, multiindex_dataframe_random_data): tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) def test_list_grouper_with_nat(self): - # GH 14715 + # GH 14715, GH#59179 df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT grouper = Grouper(key="date", freq="YS") + msg = "`groups` by one element list returns scalar is deprecated" # Grouper in a list grouping - result = df.groupby([grouper]) + gb = df.groupby([grouper]) expected = {Timestamp("2011-01-01"): Index(list(range(364)))} - tm.assert_dict_equal(result.groups, expected) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.groups + tm.assert_dict_equal(result, expected) # Test case without a list result = df.groupby(grouper) @@ -789,6 +807,7 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_level_index_value_all_na(self): # issue 20519 df = DataFrame( @@ -962,6 +981,7 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def 
test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) @@ -994,17 +1014,20 @@ def test_gb_key_len_equal_axis_len(self): class TestIteration: def test_groups(self, df): grouped = df.groupby(["A"]) - groups = grouped.groups - assert groups is grouped.groups # caching works + msg = "`groups` by one element list returns scalar is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=msg): + groups = grouped.groups + assert groups is grouped.groups # caching works - for k, v in grouped.groups.items(): + for k, v in groups.items(): assert (df.loc[v]["A"] == k).all() grouped = df.groupby(["A", "B"]) groups = grouped.groups assert groups is grouped.groups # caching works - for k, v in grouped.groups.items(): + for k, v in groups.items(): assert (df.loc[v]["A"] == k[0]).all() assert (df.loc[v]["B"] == k[1]).all() diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 33cdd1883e1b9..7e7c84fa2b390 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -256,7 +256,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): method = getattr(gb, kernel) if has_arg and numeric_only is True: # Cases where b does not appear in the result - result = method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = method(*args, **kwargs) assert "b" not in result.columns elif ( # kernels that work on any dtype and have numeric_only arg @@ -284,8 +291,7 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): [ "not allowed for this dtype", "cannot be performed against 'object' dtypes", - # On PY39 message is "a number"; on PY310 and after is "a real number" - "must be a string or a.* number", + "must be a string or a real number", "unsupported operand type", "function is not implemented for this dtype", re.escape(f"agg function failed [how->{kernel},dtype->object]"), @@ -296,7 +302,14 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" with pytest.raises(exception, match=msg): - method(*args, **kwargs) + if kernel == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: with pytest.raises( TypeError, match="got an unexpected keyword argument 'numeric_only'" diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 7d5c1625b8ab4..1044c83e3e56b 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -1,4 +1,7 @@ import numpy as np +import pytest + +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -8,6 +11,7 @@ import pandas._testing as tm +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_pipe(): # Test the pipe method of DataFrameGroupBy. 
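
Two idioms recur throughout these test hunks, and a compact restatement may help when reviewing them. The sketch below is illustrative only: the test names and the toy frame are invented for this note, and it assumes a pandas development build in which `using_string_dtype` and the `DataFrameGroupBy.corrwith` deprecation (both visible in the surrounding hunks) are present.

    import pytest

    import pandas._testing as tm
    from pandas import DataFrame, Index
    from pandas._config import using_string_dtype

    # Idiom 1: tests known to misbehave once the future string dtype is
    # enabled get an xfail mark; the condition is evaluated at collection
    # time, so the mark is inert under the default dtype.
    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
    def test_repr_roundtrip_sketch():
        idx = Index(["a", "b"])
        # the repr roundtrip is known to break under the string dtype
        res = eval(repr(idx))
        tm.assert_index_equal(res, idx)

    # Idiom 2: DataFrameGroupBy.corrwith is deprecated, so calls in the
    # suite are wrapped to assert the FutureWarning while silencing it;
    # passing warn=None instead asserts that no warning is emitted.
    def test_corrwith_warns_sketch():
        df = DataFrame({"a": [1, 1, 2], "b": [4.0, 5.0, 6.0]})
        gb = df.groupby("a")
        with tm.assert_produces_warning(
            FutureWarning, match="DataFrameGroupBy.corrwith is deprecated"
        ):
            gb.corrwith(df[["b"]])
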
# Issue #17871 diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 9301f8d56d9d2..f28967fa81ddb 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -104,6 +106,7 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): gb.transform(groupby_func, *args) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( how, by, groupby_series, groupby_func, df_with_string_col @@ -183,6 +186,8 @@ def test_groupby_raises_string( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -203,6 +208,7 @@ def func(x): getattr(gb, how)(func) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( @@ -288,6 +294,8 @@ def test_groupby_raises_datetime( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @@ -485,6 +493,8 @@ def test_groupby_raises_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @@ -528,6 +538,9 @@ def test_groupby_raises_category_np( _call_and_check(klass, msg, how, gb, groupby_func_np, ()) +@pytest.mark.filterwarnings( + "ignore:`groups` by one element list returns scalar is deprecated" +) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_category_on_category( how, @@ -658,6 +671,8 @@ def test_groupby_raises_category_on_category( if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" + elif groupby_func == "corrwith": + warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index edc94b2beeec1..8a421654cdf9b 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import iNaT from pandas.core.dtypes.common import pandas_dtype @@ -468,6 +470,7 @@ def test_max_min_non_numeric(): assert "ss" in result +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_max_min_object_multiple_columns(): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with @@ 
-982,7 +985,7 @@ def test_groupby_sum_timedelta_with_nat(): df = DataFrame( { "a": [1, 1, 2, 2], - "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + "b": [pd.Timedelta("1D"), pd.Timedelta("2D"), pd.Timedelta("3D"), pd.NaT], } ) td3 = pd.Timedelta(days=3) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ea556d043be2d..ee4973cbf18af 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -5,11 +5,13 @@ from datetime import ( datetime, timedelta, + timezone, ) import numpy as np import pytest -import pytz + +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -74,6 +76,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 # TimeGrouper requires a sorted index @@ -774,12 +777,12 @@ def test_groupby_with_timezone_selection(self): def test_timezone_info(self): # see gh-11682: Timezone info lost when broadcasting # scalar datetime to DataFrame - - df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) - assert df["b"][0].tzinfo == pytz.utc + utc = timezone.utc + df = DataFrame({"a": [1], "b": [datetime.now(utc)]}) + assert df["b"][0].tzinfo == utc df = DataFrame({"a": [1, 2, 3]}) - df["b"] = datetime.now(pytz.utc) - assert df["b"][0].tzinfo == pytz.utc + df["b"] = datetime.now(utc) + assert df["b"][0].tzinfo == utc def test_datetime_count(self): df = DataFrame( diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index d6d545a8c4834..a65dda1570944 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.core.dtypes.common import ensure_platform_int @@ -370,6 +372,7 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_transform_nuisance_raises(df): # case that goes through _transform_item_by_item @@ -442,6 +445,7 @@ def test_transform_coercion(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_with_int(): # GH 3740, make sure that we might upcast on item-by-item transform @@ -701,6 +705,7 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.slow @pytest.mark.parametrize( "op, args, targop", @@ -1025,6 +1030,7 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_dtype(): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) @@ -1104,7 +1110,14 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): return args = get_groupby_method_args(reduction_func, obj) - result = g.transform(func, *args) + if func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): + result = 
g.transform(func, *args) # this is the *definition* of a transformation tm.assert_index_equal(result.index, obj.index) @@ -1468,8 +1481,12 @@ def test_as_index_no_change(keys, df, groupby_func): args = get_groupby_method_args(groupby_func, df) gb_as_index_true = df.groupby(keys, as_index=True) gb_as_index_false = df.groupby(keys, as_index=False) - warn = FutureWarning if groupby_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" + if groupby_func == "corrwith": + warn = FutureWarning + msg = "DataFrameGroupBy.corrwith is deprecated" + else: + warn = None + msg = "" with tm.assert_produces_warning(warn, match=msg): result = gb_as_index_true.transform(groupby_func, *args) with tm.assert_produces_warning(warn, match=msg): @@ -1580,3 +1597,12 @@ def test_min_one_dim_no_type_coercion(): expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32") tm.assert_frame_equal(expected, result) + + +def test_nan_in_cumsum_group_label(): + # GH#58811 + df = DataFrame({"A": [1, None], "B": [2, 3]}, dtype="Int16") + gb = df.groupby("A")["B"] + result = gb.cumsum() + expected = Series([2, None], dtype="Int16", name="B") + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index e5956f808286d..6036eddce7a01 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -59,18 +59,12 @@ def test_index_string_inference(self): ser = Index(["a", 1]) tm.assert_index_equal(ser, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - idx = Index([pd.Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(idx) - assert result.dtype != np.object_ - - ser = Series([pd.Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Index(ser) - assert result.dtype != np.object_ + obj = klass([pd.Timestamp("2019-12-31")], dtype=object) + result = Index(obj) + assert result.dtype == np.object_ def test_constructor_not_read_only(self): # GH#57130 diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 4580e00069dc1..dc4763d96bc71 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import Index @@ -9,6 +9,7 @@ class TestIndexRendering: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather than # stylized @@ -16,7 +17,7 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -81,7 +82,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + 
@pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index d57df82b2358c..f9636ec19f2ec 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Index, @@ -231,6 +233,7 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 03a298a13dc2b..d9c9fdc62b0bc 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -21,6 +21,9 @@ class TestCategoricalIndex: @pytest.fixture def simple_index(self) -> CategoricalIndex: + """ + Fixture that provides a CategoricalIndex. + """ return CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) def test_can_hold_identifiers(self): @@ -196,7 +199,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") + @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) @@ -392,3 +395,10 @@ def test_remove_maintains_order(self): ["a", "b", np.nan, "d", "d", "a"], categories=list("dba"), ordered=True ), ) + + +def test_contains_rangeindex_categories_no_engine(): + ci = CategoricalIndex(range(3)) + assert 2 in ci + assert 5 not in ci + assert "_engine" not in ci._cache diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 491db3a63cc0d..b1361b3e8106e 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -4,14 +4,14 @@ import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex class TestCategoricalIndexRepr: - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") + @pytest.mark.xfail(using_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py index 61a79c4ceabf9..c2d76c0bcc8bd 100644 --- a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py +++ b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py @@ -70,20 +70,32 @@ def test_drop_duplicates(self, keep, expected, index, 
idx): class TestDropDuplicatesPeriodIndex(DropDuplicates): @pytest.fixture(params=["D", "3D", "h", "2h", "min", "2min", "s", "3s"]) def freq(self, request): + """ + Fixture to test for different frequencies for PeriodIndex. + """ return request.param @pytest.fixture def idx(self, freq): + """ + Fixture to get PeriodIndex for 10 periods for different frequencies. + """ return period_range("2011-01-01", periods=10, freq=freq, name="idx") class TestDropDuplicatesDatetimeIndex(DropDuplicates): @pytest.fixture def idx(self, freq_sample): + """ + Fixture to get DatetimeIndex for 10 periods for different frequencies. + """ return date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") class TestDropDuplicatesTimedeltaIndex(DropDuplicates): @pytest.fixture def idx(self, freq_sample): + """ + Fixture to get TimedeltaIndex for 10 periods for different frequencies. + """ return timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") diff --git a/pandas/tests/indexes/datetimelike_/test_equals.py b/pandas/tests/indexes/datetimelike_/test_equals.py index 08134d9f3efb4..df9182b2dd1c2 100644 --- a/pandas/tests/indexes/datetimelike_/test_equals.py +++ b/pandas/tests/indexes/datetimelike_/test_equals.py @@ -52,6 +52,7 @@ def test_not_equals_misc_strs(self, index): class TestPeriodIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a PeriodIndex for use in equality tests.""" return period_range("2013-01-01", periods=5, freq="D") # TODO: de-duplicate with other test_equals2 methods @@ -91,6 +92,7 @@ def test_equals2(self, freq): class TestDatetimeIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a DatetimeIndex for use in equality tests.""" return date_range("2013-01-01", periods=5) def test_equals2(self): @@ -143,6 +145,7 @@ def test_not_equals_bday(self, freq): class TestTimedeltaIndexEquals(EqualsTests): @pytest.fixture def index(self): + """Fixture for creating a TimedeltaIndex for use in equality tests.""" return timedelta_range("1 day", periods=10) def test_equals2(self): diff --git a/pandas/tests/indexes/datetimelike_/test_indexing.py b/pandas/tests/indexes/datetimelike_/test_indexing.py index ee7128601256a..7b2c81aaf17de 100644 --- a/pandas/tests/indexes/datetimelike_/test_indexing.py +++ b/pandas/tests/indexes/datetimelike_/test_indexing.py @@ -19,7 +19,7 @@ @pytest.mark.parametrize("ldtype", dtlike_dtypes) @pytest.mark.parametrize("rdtype", dtlike_dtypes) def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): - vals = np.tile(3600 * 10**9 * np.arange(3), 2) + vals = np.tile(3600 * 10**9 * np.arange(3, dtype=np.int64), 2) def construct(dtype): if dtype is dtlike_dtypes[-1]: diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index c0bc6601769b1..81dc3b3ecc45e 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -3,7 +3,6 @@ import dateutil import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -251,6 +250,8 @@ def _check_rng(rng): _check_rng(rng_utc) def test_index_convert_to_datetime_array_explicit_pytz(self): + pytz = pytest.importorskip("pytz") + def _check_rng(rng): converted = rng.to_pydatetime() assert isinstance(converted, np.ndarray) diff --git a/pandas/tests/indexes/datetimes/methods/test_insert.py b/pandas/tests/indexes/datetimes/methods/test_insert.py index ebfe490e0e067..4a5b7bcc1a86f 100644 --- 
a/pandas/tests/indexes/datetimes/methods/test_insert.py +++ b/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -1,8 +1,8 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest -import pytz from pandas import ( NA, @@ -133,49 +133,59 @@ def test_insert3(self, unit): assert result.name == expected.name assert result.freq is None - def test_insert4(self, unit): - for tz in ["US/Pacific", "Asia/Singapore"]: - idx = date_range( - "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit - ) - # preserve freq - expected = date_range( - "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit - ) - for d in [ - Timestamp("2000-01-01 15:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - expected = DatetimeIndex( - [ - "2000-01-01 09:00", - "2000-01-01 10:00", - "2000-01-01 11:00", - "2000-01-01 12:00", - "2000-01-01 13:00", - "2000-01-01 14:00", - "2000-01-01 10:00", - ], - name="idx", - tz=tz, - freq=None, - ).as_unit(unit) - # reset freq to None - for d in [ - Timestamp("2000-01-01 10:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.tz == expected.tz - assert result.freq is None + @pytest.mark.parametrize("tz", ["US/Pacific", "Asia/Singapore"]) + @pytest.mark.parametrize( + "to_ts", + [lambda x: x, lambda x: x.to_pydatetime()], + ids=["Timestamp", "datetime"], + ) + def test_insert4(self, unit, tz, to_ts): + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + # preserve freq + expected = date_range( + "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit + ) + tz = zoneinfo.ZoneInfo(tz) + d = to_ts(Timestamp("2000-01-01 15:00", tz=tz)) + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + @pytest.mark.parametrize("tz", ["US/Pacific", "Asia/Singapore"]) + @pytest.mark.parametrize( + "to_ts", + [lambda x: x, lambda x: x.to_pydatetime()], + ids=["Timestamp", "datetime"], + ) + def test_insert4_no_freq(self, unit, tz, to_ts): + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ).as_unit(unit) + # reset freq to None + d = to_ts(Timestamp("2000-01-01 10:00", tz=tz)) + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.tz == expected.tz + assert result.freq is None # TODO: also changes DataFrame.__setitem__ with expansion def test_insert_mismatched_tzawareness(self): @@ -214,7 +224,7 @@ def test_insert_mismatched_tz(self): assert expected.dtype == idx.dtype tm.assert_index_equal(result, expected) - item = datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern")) + item = datetime(2000, 1, 4, tzinfo=zoneinfo.ZoneInfo("US/Eastern")) result = idx.insert(3, item) expected = Index( list(idx[:3]) + [item.astimezone(idx.tzinfo)] + list(idx[3:]), diff --git 
a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py index 375dea01974bb..a202627550cd2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -1,7 +1,7 @@ from datetime import datetime +import zoneinfo import pytest -import pytz from pandas.errors import NullFrequencyError @@ -13,8 +13,6 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexShift: # ------------------------------------------------------------- @@ -122,24 +120,28 @@ def test_dti_shift_across_dst(self, unit): ) def test_dti_shift_near_midnight(self, shift, result_time, unit): # GH 8616 - dt = datetime(2014, 11, 14, 0) - dt_est = pytz.timezone("EST").localize(dt) + tz = zoneinfo.ZoneInfo("US/Eastern") + dt_est = datetime(2014, 11, 14, 0, tzinfo=tz) idx = DatetimeIndex([dt_est]).as_unit(unit) ser = Series(data=[1], index=idx) result = ser.shift(shift, freq="h") - exp_index = DatetimeIndex([result_time], tz="EST").as_unit(unit) + exp_index = DatetimeIndex([result_time], tz=tz).as_unit(unit) expected = Series(1, index=exp_index) tm.assert_series_equal(result, expected) def test_shift_periods(self, unit): # GH#22458 : argument 'n' was deprecated in favor of 'periods' - idx = date_range(start=START, end=END, periods=3, unit=unit) + idx = date_range( + start=datetime(2009, 1, 1), end=datetime(2010, 1, 1), periods=3, unit=unit + ) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) @pytest.mark.parametrize("freq", ["B", "C"]) def test_shift_bday(self, freq, unit): - rng = date_range(START, END, freq=freq, unit=unit) + rng = date_range( + datetime(2009, 1, 1), datetime(2010, 1, 1), freq=freq, unit=unit + ) shifted = rng.shift(5) assert shifted[0] == rng[5] assert shifted.freq == rng.freq @@ -153,11 +155,21 @@ def test_shift_bday(self, freq, unit): assert shifted.freq == rng.freq def test_shift_bmonth(self, performance_warning, unit): - rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + rng = date_range( + datetime(2009, 1, 1), + datetime(2010, 1, 1), + freq=pd.offsets.BMonthEnd(), + unit=unit, + ) shifted = rng.shift(1, freq=pd.offsets.BDay()) assert shifted[0] == rng[0] + pd.offsets.BDay() - rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) + rng = date_range( + datetime(2009, 1, 1), + datetime(2010, 1, 1), + freq=pd.offsets.BMonthEnd(), + unit=unit, + ) with tm.assert_produces_warning(performance_warning): shifted = rng.shift(1, freq=pd.offsets.CDay()) assert shifted[0] == rng[0] + pd.offsets.CDay() diff --git a/pandas/tests/indexes/datetimes/methods/test_snap.py b/pandas/tests/indexes/datetimes/methods/test_snap.py index 651e4383a3fac..a3c06ac6257cf 100644 --- a/pandas/tests/indexes/datetimes/methods/test_snap.py +++ b/pandas/tests/indexes/datetimes/methods/test_snap.py @@ -7,6 +7,8 @@ import pandas._testing as tm +@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") +@pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") @pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) @pytest.mark.parametrize("name", [None, "my_dti"]) def test_dti_snap(name, tz, unit): @@ -27,7 +29,9 @@ def test_dti_snap(name, tz, unit): dti = dti.as_unit(unit) result = dti.snap(freq="W-MON") - expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") + msg = "'w-mon' is deprecated and will be removed 
in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") expected = expected.repeat([3, 4]) expected = expected.as_unit(unit) tm.assert_index_equal(result, expected) @@ -37,7 +41,9 @@ def test_dti_snap(name, tz, unit): result = dti.snap(freq="B") - expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") + msg = "'b' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") expected = expected.repeat([1, 1, 1, 2, 2]) expected = expected.as_unit(unit) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 5b2cc55d6dc56..cd4a142dd5b30 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -1,7 +1,8 @@ +from datetime import timezone + import dateutil.tz from dateutil.tz import tzlocal import pytest -import pytz from pandas._libs.tslibs.ccalendar import MONTHS from pandas._libs.tslibs.offsets import MonthEnd @@ -90,24 +91,14 @@ def test_dti_to_period_2monthish(self, freq_offset, freq_period): tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq_period)) @pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ], + "freq", ["2ME", "1me", "2QE", "2QE-SEP", "1YE", "ye", "2YE-MAR"] ) - def test_to_period_frequency_M_Q_Y_deprecated(self, freq, freq_depr): - # GH#9586 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
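
This hunk swaps a deprecation check for a hard error: the warning-based test above is removed, and the raises-based test just below replaces it. A quick reviewer's sketch using the same values as the new test:

    import pytest

    import pandas as pd

    rng = pd.date_range("01-Jan-2012", periods=8, freq="ME")
    # to_period used to warn on deprecated aliases; it now rejects any
    # string that is not a valid period frequency outright.
    with pytest.raises(ValueError, match="Invalid frequency: 2ME"):
        rng.to_period("2ME")

Period frequencies keep the plain "M"/"Q"/"Y" spellings, so the "ME"-style aliases that date_range accepts are rejected here by design.
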
+ def test_to_period_frequency_M_Q_Y_raises(self, freq): + msg = f"Invalid frequency: {freq}" - rng = date_range("01-Jan-2012", periods=8, freq=freq) - prng = rng.to_period() - with tm.assert_produces_warning(FutureWarning, match=msg): - assert prng.freq == freq_depr + rng = date_range("01-Jan-2012", periods=8, freq="ME") + with pytest.raises(ValueError, match=msg): + rng.to_period(freq) def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 @@ -165,7 +156,13 @@ def test_to_period_microsecond(self): @pytest.mark.parametrize( "tz", - ["US/Eastern", pytz.utc, tzlocal(), "dateutil/US/Eastern", dateutil.tz.tzutc()], + [ + "US/Eastern", + timezone.utc, + tzlocal(), + "dateutil/US/Eastern", + dateutil.tz.tzutc(), + ], ) def test_to_period_tz(self, tz): ts = date_range("1/1/2000", "2/1/2000", tz=tz) @@ -208,10 +205,16 @@ def test_to_period_nofreq(self): assert idx.freqstr is None tm.assert_index_equal(idx.to_period(), expected) - @pytest.mark.parametrize("freq", ["2BMS", "1SME-15"]) + @pytest.mark.parametrize("freq", ["2BME", "SME-15", "2BMS"]) def test_to_period_offsets_not_supported(self, freq): # GH#56243 - msg = f"{freq[1:]} is not supported as period frequency" + msg = "|".join( + [ + f"Invalid frequency: {freq}", + f"{freq} is not supported as period frequency", + ] + ) + ts = date_range("1/1/2012", periods=4, freq=freq) with pytest.raises(ValueError, match=msg): ts.to_period() diff --git a/pandas/tests/indexes/datetimes/methods/test_to_series.py b/pandas/tests/indexes/datetimes/methods/test_to_series.py index 0c397c8ab2cd3..cd67775b7a5fc 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_series.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_series.py @@ -13,6 +13,6 @@ def test_to_series(self): idx = naive.tz_localize("US/Pacific") expected = Series(np.array(idx.tolist(), dtype="object"), name="B") - result = idx.to_series(index=[0, 1]) + result = idx.to_series(index=range(2)) assert expected.dtype == idx.dtype tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_convert.py b/pandas/tests/indexes/datetimes/methods/test_tz_convert.py index b2cf488ac8313..9eabb742b93a4 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_convert.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_convert.py @@ -4,7 +4,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import timezones @@ -260,11 +259,14 @@ def test_dti_tz_convert_tzlocal(self): [ "US/Eastern", "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), ], ) def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tz) diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py index ad7769c6b9671..c6697fd169e8a 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py @@ -1,7 +1,9 @@ from datetime import ( datetime, timedelta, + timezone, ) +from zoneinfo import ZoneInfo import dateutil.tz from dateutil.tz import gettz @@ -19,22 +21,13 @@ ) import pandas._testing as tm -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type [misc] - ZoneInfo = 
None # type: ignore[misc, assignment] - -easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] -if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Eastern") - except KeyError: - # no tzdata - pass - else: - easts.append(tz) +@pytest.fixture(params=["pytz/US/Eastern", gettz("US/Eastern"), ZoneInfo("US/Eastern")]) +def tz(request): + if isinstance(request.param, str) and request.param.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + return pytz.timezone(request.param.removeprefix("pytz/")) + return request.param class TestTZLocalize: @@ -88,7 +81,6 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): expected = dti.tz_convert("US/Eastern") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition @@ -96,7 +88,6 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dr.tz_localize(tz) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): # With repeated hours, we can infer the transition dr = date_range( @@ -116,7 +107,6 @@ def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): result2 = DatetimeIndex(times, tz=tz, ambiguous="infer").as_unit(unit) tm.assert_index_equal(result2, expected) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_infer3(self, tz): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) @@ -124,7 +114,6 @@ def test_dti_tz_localize_ambiguous_infer3(self, tz): localized_infer = dr.tz_localize(tz, ambiguous="infer") tm.assert_index_equal(localized, localized_infer) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_times(self, tz): # March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour()) @@ -143,7 +132,7 @@ def test_dti_tz_localize_ambiguous_times(self, tz): # UTC is OK dr = date_range( - datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=pytz.utc + datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=timezone.utc ) @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) @@ -181,15 +170,6 @@ def test_dti_tz_localize(self, prefix): with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): dti.tz_localize(tzstr) - @pytest.mark.parametrize( - "tz", - [ - "US/Eastern", - "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - ], - ) def test_dti_tz_localize_utc_conversion(self, tz): # Localizing to time zone should: # 1) check for DST ambiguities @@ -245,7 +225,6 @@ def test_dti_tz_localize_tzlocal(self): dti2 = dti.tz_localize(None) tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_nat(self, tz): times = [ "11/06/2011 00:00", @@ -270,7 +249,6 @@ def test_dti_tz_localize_ambiguous_nat(self, tz): # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] tm.assert_numpy_array_equal(di_test.values, localized.values) - @pytest.mark.parametrize("tz", easts) def test_dti_tz_localize_ambiguous_flags(self, tz, unit): # November 6, 2011, fall back, repeat 2 AM hour @@ -321,8 +299,7 @@ def test_dti_tz_localize_ambiguous_flags(self, tz, unit): dr = dr.append(dr) tm.assert_index_equal(dr, 
localized) - @pytest.mark.parametrize("tz", easts) - def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): + def test_dti_tz_localize_ambiguous_flags2(self, tz): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) is_dst = np.array([1] * 10) @@ -332,8 +309,8 @@ def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): def test_dti_tz_localize_bdate_range(self): dr = bdate_range("1/1/2009", "1/1/2010") - dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) - localized = dr.tz_localize(pytz.utc) + dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=timezone.utc) + localized = dr.tz_localize(timezone.utc) tm.assert_index_equal(dr_utc, localized) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 4be45e834ce31..aba440ceeb56b 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -7,6 +7,7 @@ ) from functools import partial from operator import attrgetter +import zoneinfo import dateutil import dateutil.tz @@ -16,7 +17,6 @@ import pytz from pandas._libs.tslibs import ( - OutOfBoundsDatetime, astype_overflowsafe, timezones, ) @@ -153,7 +153,9 @@ def test_construction_caching(self): df = pd.DataFrame( { "dt": date_range("20130101", periods=3), - "dttz": date_range("20130101", periods=3, tz="US/Eastern"), + "dttz": date_range( + "20130101", periods=3, tz=zoneinfo.ZoneInfo("US/Eastern") + ), "dt_with_null": [ Timestamp("20130101"), pd.NaT, @@ -162,7 +164,7 @@ def test_construction_caching(self): "dtns": date_range("20130101", periods=3, freq="ns"), } ) - assert df.dttz.dtype.tz.zone == "US/Eastern" + assert df.dttz.dtype.tz.key == "US/Eastern" @pytest.mark.parametrize( "kwargs", @@ -199,7 +201,11 @@ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): # incompat tz/dtype msg = "cannot supply both a tz and a dtype with a tz" with pytest.raises(ValueError, match=msg): - DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz="US/Pacific") + DatetimeIndex( + i.tz_localize(None).asi8, + dtype=i.dtype, + tz=zoneinfo.ZoneInfo("US/Hawaii"), + ) def test_construction_index_with_mixed_timezones(self): # gh-11488: no tz results in DatetimeIndex @@ -519,7 +525,7 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), Timestamp("2011-01-02 10:00", tz="US/Eastern"), ], - dtype="M8[ns, US/Eastern]", + dtype="M8[s, US/Eastern]", name="idx", ) tm.assert_index_equal(dti, expected) @@ -541,31 +547,25 @@ def test_construction_outofbounds(self): datetime(5000, 1, 1), datetime(6000, 1, 1), ] - exp = Index(dates, dtype=object) - # coerces to object - tm.assert_index_equal(Index(dates), exp) + exp = Index(dates, dtype="M8[us]") + res = Index(dates) + tm.assert_index_equal(res, exp) - msg = "^Out of bounds nanosecond timestamp: 3000-01-01 00:00:00, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - # can't create DatetimeIndex - DatetimeIndex(dates) + DatetimeIndex(dates) @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) def test_dti_date_out_of_range(self, data): # GH#1475 - msg = ( - "^Out of bounds nanosecond timestamp: " - "1400-01-01( 00:00:00)?, at position 0$" - ) - with pytest.raises(OutOfBoundsDatetime, match=msg): - DatetimeIndex(data) + DatetimeIndex(data) def test_construction_with_ndarray(self): # GH 5152 dates = [datetime(2013, 10, 7), 
datetime(2013, 10, 8), datetime(2013, 10, 9)] data = DatetimeIndex(dates, freq=offsets.BDay()).values result = DatetimeIndex(data, freq=offsets.BDay()) - expected = DatetimeIndex(["2013-10-07", "2013-10-08", "2013-10-09"], freq="B") + expected = DatetimeIndex( + ["2013-10-07", "2013-10-08", "2013-10-09"], dtype="M8[us]", freq="B" + ) tm.assert_index_equal(result, expected) def test_integer_values_and_tz_interpreted_as_utc(self): @@ -603,7 +603,7 @@ def test_constructor_coverage(self): expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) - from_ints = DatetimeIndex(expected.asi8) + from_ints = DatetimeIndex(expected.as_unit("ns").asi8).as_unit("s") tm.assert_index_equal(from_ints, expected) # string with NaT @@ -612,7 +612,7 @@ def test_constructor_coverage(self): expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) - from_ints = DatetimeIndex(expected.asi8) + from_ints = DatetimeIndex(expected.as_unit("ns").asi8).as_unit("s") tm.assert_index_equal(from_ints, expected) # non-conforming @@ -743,7 +743,7 @@ def test_disallow_setting_tz(self): dti = DatetimeIndex(["2010"], tz="UTC") msg = "Cannot directly set timezone" with pytest.raises(AttributeError, match=msg): - dti.tz = pytz.timezone("US/Pacific") + dti.tz = zoneinfo.ZoneInfo("US/Pacific") @pytest.mark.parametrize( "tz", @@ -771,7 +771,9 @@ def test_constructor_start_end_with_tz(self, tz): @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) def test_constructor_with_non_normalized_pytz(self, tz): # GH 18595 - non_norm_tz = Timestamp("2010", tz=tz).tz + pytz = pytest.importorskip("pytz") + tz_in = pytz.timezone(tz) + non_norm_tz = Timestamp("2010", tz=tz_in).tz result = DatetimeIndex(["2010"], tz=non_norm_tz) assert pytz.timezone(tz) is result.tz @@ -781,8 +783,10 @@ def test_constructor_timestamp_near_dst(self): Timestamp("2016-10-30 03:00:00+0300", tz="Europe/Helsinki"), Timestamp("2016-10-30 03:00:00+0200", tz="Europe/Helsinki"), ] - result = DatetimeIndex(ts) - expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) + result = DatetimeIndex(ts).as_unit("ns") + expected = DatetimeIndex( + [ts[0].to_pydatetime(), ts[1].to_pydatetime()] + ).as_unit("ns") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) @@ -825,7 +829,7 @@ def test_construction_from_replaced_timestamps_with_dst(self): "2005-06-01 00:00:00", ], tz="Australia/Melbourne", - ) + ).as_unit("ns") tm.assert_index_equal(result, expected) def test_construction_with_tz_and_tz_aware_dti(self): @@ -837,8 +841,8 @@ def test_construction_with_tz_and_tz_aware_dti(self): def test_construction_with_nat_and_tzlocal(self): tz = dateutil.tz.tzlocal() - result = DatetimeIndex(["2018", "NaT"], tz=tz) - expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) + result = DatetimeIndex(["2018", "NaT"], tz=tz).as_unit("ns") + expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]).as_unit("ns") tm.assert_index_equal(result, expected) def test_constructor_with_ambiguous_keyword_arg(self): @@ -881,7 +885,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), Timestamp("2015-03-29 04:00:00+02:00", tz=timezone), ] - ) + ).as_unit("ns") tm.assert_index_equal(result, expected) @@ -893,7 +897,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), ] - ) + 
).as_unit("ns") tm.assert_index_equal(result, expected) @@ -919,7 +923,9 @@ def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(sel expected = DatetimeIndex([Timestamp("2019", tz="UTC"), pd.NaT]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Eastern"), gettz("US/Eastern")] + ) def test_dti_from_tzaware_datetime(self, tz): d = [datetime(2012, 8, 19, tzinfo=tz)] @@ -934,13 +940,16 @@ def test_dti_tz_constructors(self, tzstr): arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] idx1 = to_datetime(arr).tz_localize(tzstr) - idx2 = date_range(start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr) + idx2 = date_range( + start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr, unit="s" + ) idx2 = idx2._with_freq(None) # the others all have freq=None - idx3 = DatetimeIndex(arr, tz=tzstr) - idx4 = DatetimeIndex(np.array(arr), tz=tzstr) + idx3 = DatetimeIndex(arr, tz=tzstr).as_unit("s") + idx4 = DatetimeIndex(np.array(arr), tz=tzstr).as_unit("s") - for other in [idx2, idx3, idx4]: - tm.assert_index_equal(idx1, other) + tm.assert_index_equal(idx1, idx2) + tm.assert_index_equal(idx1, idx3) + tm.assert_index_equal(idx1, idx4) def test_dti_construction_idempotent(self, unit): rng = date_range( @@ -965,7 +974,7 @@ def test_dti_convert_datetime_list(self, tzstr): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), ], ) @@ -974,6 +983,8 @@ def test_dti_convert_datetime_list(self, tzstr): def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): # GH#47471 check that we get the same raising behavior in the DTI # constructor and Timestamp constructor + if isinstance(tz, str) and tz.startswith("pytz/"): + tz = pytz.timezone(tz.removeprefix("pytz/")) dtstr = "2013-11-03 01:59:59.999999" item = dtstr if not use_str: @@ -1187,9 +1198,9 @@ def test_dti_constructor_object_dtype_dayfirst_yearfirst_with_tz(self): yfirst = Timestamp(2005, 10, 16, tz="US/Pacific") result1 = DatetimeIndex([val], tz="US/Pacific", dayfirst=True) - expected1 = DatetimeIndex([dfirst]) + expected1 = DatetimeIndex([dfirst]).as_unit("s") tm.assert_index_equal(result1, expected1) result2 = DatetimeIndex([val], tz="US/Pacific", yearfirst=True) - expected2 = DatetimeIndex([yfirst]) + expected2 = DatetimeIndex([yfirst]).as_unit("s") tm.assert_index_equal(result2, expected2) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 99d05dd0f26e4..b37b5cf74b347 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -12,7 +12,6 @@ import numpy as np import pytest import pytz -from pytz import timezone from pandas._libs.tslibs import timezones from pandas._libs.tslibs.offsets import ( @@ -97,6 +96,7 @@ def test_date_range_timestamp_equiv_dateutil(self): assert ts == stamp def test_date_range_timestamp_equiv_explicit_pytz(self): + pytz = pytest.importorskip("pytz") rng = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) stamp = rng[0] @@ -144,24 +144,12 @@ def test_date_range_fractional_period(self): with pytest.raises(TypeError, match=msg): date_range("1/1/2000", periods=10.5) - @pytest.mark.parametrize( - "freq,freq_depr", - [ - ("2ME", "2M"), - ("2SME", "2SM"), - ("2BQE", "2BQ"), - ("2BYE", "2BY"), - ], - ) - def 
test_date_range_frequency_M_SM_BQ_BY_deprecated(self, freq, freq_depr): - # GH#52064 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." + @pytest.mark.parametrize("freq", ["2M", "1m", "2SM", "2BQ", "1bq", "2BY"]) + def test_date_range_frequency_M_SM_BQ_BY_raises(self, freq): + msg = f"Invalid frequency: {freq}" - expected = date_range("1/1/2000", periods=4, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = date_range("1/1/2000", periods=4, freq=freq_depr) - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", periods=4, freq=freq) def test_date_range_tuple_freq_raises(self): # GH#34703 @@ -502,7 +490,8 @@ def test_range_bug(self, unit): def test_range_tz_pytz(self): # see gh-2906 - tz = timezone("US/Eastern") + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Eastern") start = tz.localize(datetime(2011, 1, 1)) end = tz.localize(datetime(2011, 1, 3)) @@ -529,14 +518,16 @@ def test_range_tz_pytz(self): ], ) def test_range_tz_dst_straddle_pytz(self, start, end): - start = Timestamp(start, tz="US/Eastern") - end = Timestamp(end, tz="US/Eastern") + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Eastern") + start = Timestamp(start, tz=tz) + end = Timestamp(end, tz=tz) dr = date_range(start, end, freq="D") assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) - dr = date_range(start, end, freq="D", tz="US/Eastern") + dr = date_range(start, end, freq="D", tz=tz) assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) @@ -545,7 +536,7 @@ def test_range_tz_dst_straddle_pytz(self, start, end): start.replace(tzinfo=None), end.replace(tzinfo=None), freq="D", - tz="US/Eastern", + tz=tz, ) assert dr[0] == start assert dr[-1] == end @@ -777,34 +768,13 @@ def test_frequency_H_T_S_L_U_N_raises(self, freq): date_range("1/1/2000", periods=2, freq=freq) @pytest.mark.parametrize( - "freq,freq_depr", - [ - ("YE", "Y"), - ("YE-MAY", "Y-MAY"), - ], + "freq_depr", ["m", "bm", "CBM", "SM", "BQ", "q-feb", "y-may", "Y-MAY"] ) - def test_frequencies_Y_renamed(self, freq, freq_depr): - # GH#9586, GH#54275 - freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] - freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] - msg = f"'{freq_depr_msg}' is deprecated and will be removed " - f"in a future version, please use '{freq_msg}' instead." + def test_frequency_raises(self, freq_depr): + msg = f"Invalid frequency: {freq_depr}" - expected = date_range("1/1/2000", periods=2, freq=freq) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = date_range("1/1/2000", periods=2, freq=freq_depr) - tm.assert_index_equal(result, expected) - - def test_to_offset_with_lowercase_deprecated_freq(self) -> None: - # https://github.com/pandas-dev/pandas/issues/56847 - msg = ( - "'m' is deprecated and will be removed in a future version, please use " - "'ME' instead." 
-        )
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = date_range("2010-01-01", periods=2, freq="m")
-        expected = DatetimeIndex(["2010-01-31", "2010-02-28"], freq="ME")
-        tm.assert_index_equal(result, expected)
+        with pytest.raises(ValueError, match=msg):
+            date_range("1/1/2000", periods=2, freq=freq_depr)

     def test_date_range_bday(self):
         sdate = datetime(1999, 12, 25)
@@ -821,6 +791,28 @@ def test_frequency_A_raises(self, freq):
         with pytest.raises(ValueError, match=msg):
             date_range("1/1/2000", periods=2, freq=freq)

+    @pytest.mark.parametrize(
+        "freq,freq_depr",
+        [
+            ("2W", "2w"),
+            ("2W-WED", "2w-wed"),
+            ("2B", "2b"),
+            ("2D", "2d"),
+            ("2C", "2c"),
+        ],
+    )
+    def test_date_range_depr_lowercase_frequency(self, freq, freq_depr):
+        # GH#58998
+        depr_msg = (
+            f"'{freq_depr[1:]}' is deprecated and will be removed "
+            "in a future version."
+        )
+
+        expected = date_range("1/1/2000", periods=4, freq=freq)
+        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
+            result = date_range("1/1/2000", periods=4, freq=freq_depr)
+        tm.assert_index_equal(result, expected)
+

 class TestDateRangeTZ:
     """Tests for date_range with timezones"""
diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
index 84a616f05cd63..04334a1d8d0c8 100644
--- a/pandas/tests/indexes/datetimes/test_datetime.py
+++ b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -133,53 +133,19 @@ def test_asarray_tz_aware(self):
         tm.assert_numpy_array_equal(result, expected)

-    def test_CBH_deprecated(self):
-        msg = "'CBH' is deprecated and will be removed in a future version."
-
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            expected = date_range(
-                dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq="CBH"
-            )
-        result = DatetimeIndex(
-            [
-                "2022-12-12 09:00:00",
-                "2022-12-12 10:00:00",
-                "2022-12-12 11:00:00",
-                "2022-12-12 12:00:00",
-                "2022-12-12 13:00:00",
-                "2022-12-12 14:00:00",
-                "2022-12-12 15:00:00",
-                "2022-12-12 16:00:00",
-            ],
-            dtype="datetime64[ns]",
-            freq="cbh",
-        )
+    @pytest.mark.parametrize("freq", ["2H", "2BH", "2S"])
+    def test_CBH_raises(self, freq):
+        msg = f"Invalid frequency: {freq}"
-        tm.assert_index_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "freq, expected_values, freq_depr",
-        [
-            ("2BYE-JUN", ["2016-06-30"], "2BY-JUN"),
-            ("2BME", ["2016-02-29", "2016-04-29", "2016-06-30"], "2BM"),
-            ("2BQE", ["2016-03-31"], "2BQ"),
-            ("1BQE-MAR", ["2016-03-31", "2016-06-30"], "1BQ-MAR"),
-        ],
-    )
-    def test_BM_BQ_BY_deprecated(self, freq, expected_values, freq_depr):
-        # GH#52064
-        msg = f"'{freq_depr[1:]}' is deprecated and will be removed "
-        f"in a future version, please use '{freq[1:]}' instead."
- - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range(start="2016-02-21", end="2016-08-21", freq=freq_depr) - result = DatetimeIndex( - data=expected_values, - dtype="datetime64[ns]", - freq=freq, - ) + with pytest.raises(ValueError, match=msg): + date_range(dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq=freq) - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("freq", ["2BM", "1bm", "2BQ", "1BQ-MAR", "2BY-JUN", "1by"]) + def test_BM_BQ_BY_raises(self, freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + date_range(start="2016-02-21", end="2016-08-21", freq=freq) @pytest.mark.parametrize("freq", ["2BA-MAR", "1BAS-MAY", "2AS-AUG"]) def test_BA_BAS_raises(self, freq): diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 6e4e22942ab07..4551fdf073193 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -1,9 +1,11 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import dateutil.tz import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -276,7 +278,7 @@ def test_dti_summary(self): result = idx._summary() assert result == expected - @pytest.mark.parametrize("tz", [None, pytz.utc, dateutil.tz.tzutc()]) + @pytest.mark.parametrize("tz", [None, timezone.utc, dateutil.tz.tzutc()]) @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_repr_etc_smoke(self, tz, freq): # only really care that it works diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 173b32b12e2d1..94175a56f1c4a 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -35,7 +35,7 @@ def test_string_index_series_name_converted(self): def test_stringified_slice_with_tz(self): # GH#2658 start = "2013-01-07" - idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern") + idx = date_range(start=start, freq="1D", periods=10, tz="US/Eastern") df = DataFrame(np.arange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 5831846c9ceb6..eb472b099fb1f 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -11,6 +11,8 @@ import locale import unicodedata +from hypothesis import given +import hypothesis.strategies as st import numpy as np import pytest @@ -329,6 +331,84 @@ def test_dti_is_month_start_custom(self): with pytest.raises(ValueError, match=msg): dti.is_month_start + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([False, True, False])), + ("2017-12-01", "QS", 3, np.array([True, False, False])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_year_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_year_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, False, False])), + ("2017-12-01", "QE", 3, np.array([True, False, False])), + ("2017-12-01", 
"YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_year_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_year_end + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([False, True, False])), + ("2017-12-01", "QS", 3, np.array([True, True, True])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_quarter_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_quarter_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, False, False])), + ("2017-12-01", "QE", 3, np.array([True, True, True])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_quarter_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_quarter_end + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "MS", 3, np.array([True, True, True])), + ("2017-12-01", "QS", 3, np.array([True, True, True])), + ("2017-12-01", "YS", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_month_start(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_month_start + tm.assert_numpy_array_equal(result, expected_values) + + @pytest.mark.parametrize( + "timestamp, freq, periods, expected_values", + [ + ("2017-12-01", "ME", 3, np.array([True, True, True])), + ("2017-12-01", "QE", 3, np.array([True, True, True])), + ("2017-12-01", "YE", 3, np.array([True, True, True])), + ], + ) + def test_dti_dr_is_month_end(self, timestamp, freq, periods, expected_values): + # GH57377 + result = date_range(timestamp, freq=freq, periods=periods).is_month_end + tm.assert_numpy_array_equal(result, expected_values) + def test_dti_is_year_quarter_start_doubledigit_freq(self): # GH#58523 dr = date_range("2017-01-01", periods=2, freq="10YS") @@ -343,3 +423,29 @@ def test_dti_is_year_start_freq_custom_business_day_with_digit(self): msg = "Custom business days is not supported by is_year_start" with pytest.raises(ValueError, match=msg): dr.is_year_start + + @pytest.mark.parametrize("freq", ["3BMS", offsets.BusinessMonthBegin(3)]) + def test_dti_is_year_quarter_start_freq_business_month_begin(self, freq): + # GH#58729 + dr = date_range("2020-01-01", periods=5, freq=freq) + result = [x.is_year_start for x in dr] + assert result == [True, False, False, False, True] + + dr = date_range("2020-01-01", periods=4, freq=freq) + result = [x.is_quarter_start for x in dr] + assert all(dr.is_quarter_start) + + +@given( + dt=st.datetimes(min_value=datetime(1960, 1, 1), max_value=datetime(1980, 1, 1)), + n=st.integers(min_value=1, max_value=10), + freq=st.sampled_from(["MS", "QS", "YS"]), +) +@pytest.mark.slow +def test_against_scalar_parametric(freq, dt, n): + # https://github.com/pandas-dev/pandas/issues/49606 + freq = f"{n}{freq}" + d = date_range(dt, periods=3, freq=freq) + result = list(d.is_year_start) + expected = [x.is_year_start for x in d] + assert result == expected diff --git a/pandas/tests/indexes/datetimes/test_setops.py 
b/pandas/tests/indexes/datetimes/test_setops.py index fc3a1d4721841..7ef6efad0ff6f 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -1,11 +1,11 @@ from datetime import ( datetime, + timedelta, timezone, ) import numpy as np import pytest -import pytz import pandas.util._test_decorators as td @@ -560,6 +560,7 @@ def test_intersection_list(self): tm.assert_index_equal(res, idx) def test_month_range_union_tz_pytz(self, sort): + pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") early_start = datetime(2011, 1, 1) @@ -648,7 +649,7 @@ def test_intersection_bug(self): assert result.freq == b.freq @pytest.mark.parametrize( - "tz", [None, "UTC", "Europe/Berlin", pytz.FixedOffset(-60)] + "tz", [None, "UTC", "Europe/Berlin", timezone(timedelta(hours=-1))] ) def test_intersection_dst_transition(self, tz): # GH 46702: Europe/Berlin has DST transition @@ -664,3 +665,32 @@ def test_intersection_dst_transition(self, tz): result = index1.union(index2) expected = date_range("2021-10-28", periods=6, freq="D", tz="Europe/London") tm.assert_index_equal(result, expected) + + +def test_union_non_nano_rangelike(): + # GH 59036 + l1 = DatetimeIndex( + ["2024-05-11", "2024-05-12"], dtype="datetime64[us]", name="Date", freq="D" + ) + l2 = DatetimeIndex(["2024-05-13"], dtype="datetime64[us]", name="Date", freq="D") + result = l1.union(l2) + expected = DatetimeIndex( + ["2024-05-11", "2024-05-12", "2024-05-13"], + dtype="datetime64[us]", + name="Date", + freq="D", + ) + tm.assert_index_equal(result, expected) + + +def test_intersection_non_nano_rangelike(): + # GH 59271 + l1 = date_range("2024-01-01", "2024-01-03", unit="s") + l2 = date_range("2024-01-02", "2024-01-04", unit="s") + result = l1.intersection(l2) + expected = DatetimeIndex( + ["2024-01-02", "2024-01-03"], + dtype="datetime64[s]", + freq="D", + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 0c8bdbdd2fb22..e4b8a909add0d 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -8,11 +8,11 @@ timezone, tzinfo, ) +import zoneinfo from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( conversion, @@ -184,8 +184,11 @@ def test_dti_tz_nat(self, tzstr): assert isna(idx[1]) assert idx[0].tzinfo is not None - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + @pytest.mark.parametrize("tzstr", ["pytz/US/Eastern", "dateutil/US/Eastern"]) def test_utc_box_timestamp_and_localize(self, tzstr): + if tzstr.startswith("pytz/"): + pytest.importorskip("pytz") + tzstr = tzstr.removeprefix("pytz/") tz = timezones.maybe_get_tz(tzstr) rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") @@ -206,15 +209,17 @@ def test_utc_box_timestamp_and_localize(self, tzstr): rng_eastern[0].tzinfo ) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Central"), gettz("US/Central")] + ) def test_with_tz(self, tz): # just want it to work - start = datetime(2011, 3, 12, tzinfo=pytz.utc) + start = datetime(2011, 3, 12, tzinfo=timezone.utc) dr = bdate_range(start, periods=50, freq=pd.offsets.Hour()) - assert dr.tz is pytz.utc + assert dr.tz is timezone.utc # DateRange with naive datetimes - dr = bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc) + dr = 
bdate_range("1/1/2005", "1/1/2009", tz=timezone.utc) dr = bdate_range("1/1/2005", "1/1/2009", tz=tz) # normalized @@ -231,13 +236,16 @@ def test_with_tz(self, tz): # datetimes with tzinfo set dr = bdate_range( - datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) + datetime(2005, 1, 1, tzinfo=timezone.utc), + datetime(2009, 1, 1, tzinfo=timezone.utc), ) msg = "Start and end cannot both be tz-aware with different timezones" with pytest.raises(Exception, match=msg): - bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) + bdate_range(datetime(2005, 1, 1, tzinfo=timezone.utc), "1/1/2009", tz=tz) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + @pytest.mark.parametrize( + "tz", [zoneinfo.ZoneInfo("US/Eastern"), gettz("US/Eastern")] + ) def test_dti_convert_tz_aware_datetime_datetime(self, tz): # GH#1581 dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)] diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 4a9eb4dd9fc0c..8db483751438c 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -210,6 +210,7 @@ class TestFromArrays(ConstructorTests): @pytest.fixture def constructor(self): + """Fixture for IntervalIndex.from_arrays constructor""" return IntervalIndex.from_arrays def get_kwargs_from_breaks(self, breaks, closed="right"): @@ -282,6 +283,7 @@ class TestFromBreaks(ConstructorTests): @pytest.fixture def constructor(self): + """Fixture for IntervalIndex.from_breaks constructor""" return IntervalIndex.from_breaks def get_kwargs_from_breaks(self, breaks, closed="right"): @@ -320,6 +322,7 @@ class TestFromTuples(ConstructorTests): @pytest.fixture def constructor(self): + """Fixture for IntervalIndex.from_tuples constructor""" return IntervalIndex.from_tuples def get_kwargs_from_breaks(self, breaks, closed="right"): @@ -370,6 +373,7 @@ class TestClassConstructors(ConstructorTests): @pytest.fixture def constructor(self): + """Fixture for IntervalIndex class constructor""" return IntervalIndex def get_kwargs_from_breaks(self, breaks, closed="right"): diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 3b8e18463160f..f858ae137ca4e 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas import ( DataFrame, DatetimeIndex, @@ -42,12 +40,11 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request): result = repr(obj) assert result == expected - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 markers = Series( - ["foo", "bar"], + [1, 2], index=IntervalIndex( [ Interval(left, right) @@ -59,7 +56,7 @@ def test_repr_floats(self): ), ) result = str(markers) - expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object" + expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py index 28c77e78924cb..4db74a716c514 100644 --- a/pandas/tests/indexes/multi/test_get_level_values.py +++ b/pandas/tests/indexes/multi/test_get_level_values.py @@ -122,3 +122,12 @@ def 
test_values_loses_freq_of_underlying_index(): midx.values assert idx.freq is not None tm.assert_index_equal(idx, expected) + + +def test_get_level_values_gets_frequency_correctly(): + # GH#57949 GH#58327 + datetime_index = date_range(start=pd.to_datetime("1/1/2018"), periods=4, freq="YS") + other_index = ["A"] + multi_index = MultiIndex.from_product([datetime_index, other_index]) + + assert multi_index.get_level_values(0).freq == datetime_index.freq diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index f08a7625e7f8a..d82203a53a60f 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -1,3 +1,4 @@ +from collections import namedtuple from datetime import timedelta import re @@ -1006,3 +1007,26 @@ def test_get_indexer_for_multiindex_with_nans(nulls_fixture): result = idx1.get_indexer(idx2) expected = np.array([-1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + + +def test_get_loc_namedtuple_behaves_like_tuple(): + # GH57922 + NamedIndex = namedtuple("NamedIndex", ("a", "b")) + multi_idx = MultiIndex.from_tuples( + [NamedIndex("i1", "i2"), NamedIndex("i3", "i4"), NamedIndex("i5", "i6")] + ) + for idx in (multi_idx, multi_idx.to_flat_index()): + assert idx.get_loc(NamedIndex("i1", "i2")) == 0 + assert idx.get_loc(NamedIndex("i3", "i4")) == 1 + assert idx.get_loc(NamedIndex("i5", "i6")) == 2 + assert idx.get_loc(("i1", "i2")) == 0 + assert idx.get_loc(("i3", "i4")) == 1 + assert idx.get_loc(("i5", "i6")) == 2 + multi_idx = MultiIndex.from_tuples([("i1", "i2"), ("i3", "i4"), ("i5", "i6")]) + for idx in (multi_idx, multi_idx.to_flat_index()): + assert idx.get_loc(NamedIndex("i1", "i2")) == 0 + assert idx.get_loc(NamedIndex("i3", "i4")) == 1 + assert idx.get_loc(NamedIndex("i5", "i6")) == 2 + assert idx.get_loc(("i1", "i2")) == 0 + assert idx.get_loc(("i3", "i4")) == 1 + assert idx.get_loc(("i5", "i6")) == 2 diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 06dbb33aadf97..cc3dadc6bb61c 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -1,8 +1,8 @@ from datetime import datetime +import zoneinfo import numpy as np import pytest -import pytz import pandas as pd from pandas import ( @@ -114,11 +114,11 @@ def test_append_index(): result = idx1.append(midx_lv2) # see gh-7112 - tz = pytz.timezone("Asia/Tokyo") + tz = zoneinfo.ZoneInfo("Asia/Tokyo") expected_tuples = [ - (1.1, tz.localize(datetime(2011, 1, 1))), - (1.2, tz.localize(datetime(2011, 1, 2))), - (1.3, tz.localize(datetime(2011, 1, 3))), + (1.1, datetime(2011, 1, 1, tzinfo=tz)), + (1.2, datetime(2011, 1, 2, tzinfo=tz)), + (1.3, datetime(2011, 1, 3, tzinfo=tz)), ] expected = Index([1.1, 1.2, 1.3] + expected_tuples) tm.assert_index_equal(result, expected) @@ -138,9 +138,9 @@ def test_append_index(): expected = Index._simple_new( np.array( [ - (1.1, tz.localize(datetime(2011, 1, 1)), "A"), - (1.2, tz.localize(datetime(2011, 1, 2)), "B"), - (1.3, tz.localize(datetime(2011, 1, 3)), "C"), + (1.1, datetime(2011, 1, 1, tzinfo=tz), "A"), + (1.2, datetime(2011, 1, 2, tzinfo=tz), "B"), + (1.3, datetime(2011, 1, 3, tzinfo=tz), "C"), ] + expected_tuples, dtype=object, diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 47f21cc7f8182..e85091aaae608 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -1,6 +1,8 @@ 
import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( CategoricalIndex, @@ -752,6 +754,7 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): tm.assert_index_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_union_with_na_when_constructing_dataframe(): # GH43222 series1 = Series( diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/indexes/multi/test_util.py similarity index 78% rename from pandas/tests/reshape/test_util.py rename to pandas/tests/indexes/multi/test_util.py index d2971db3d7aa2..68792ce53f04e 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/indexes/multi/test_util.py @@ -6,7 +6,7 @@ date_range, ) import pandas._testing as tm -from pandas.core.reshape.util import cartesian_product +from pandas.core.indexes.multi import cartesian_product class TestCartesianProduct: @@ -28,22 +28,6 @@ def test_datetimeindex(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) - def test_tzaware_retained(self): - x = date_range("2000-01-01", periods=2, tz="US/Pacific") - y = np.array([3, 4]) - result1, result2 = cartesian_product([x, y]) - - expected = x.repeat(2) - tm.assert_index_equal(result1, expected) - - def test_tzaware_retained_categorical(self): - x = date_range("2000-01-01", periods=2, tz="US/Pacific").astype("category") - y = np.array([3, 4]) - result1, result2 = cartesian_product([x, y]) - - expected = x.repeat(2) - tm.assert_index_equal(result1, expected) - @pytest.mark.parametrize("x, y", [[[], []], [[0, 1], []], [[], ["a", "b", "c"]]]) def test_empty(self, x, y): # product of empty factors diff --git a/pandas/tests/indexes/numeric/test_setops.py b/pandas/tests/indexes/numeric/test_setops.py index e9e5a57dfe9e5..5d3981dbf93d0 100644 --- a/pandas/tests/indexes/numeric/test_setops.py +++ b/pandas/tests/indexes/numeric/test_setops.py @@ -41,7 +41,7 @@ def test_intersection(self): other = Index([1, 2, 3, 4, 5]) result = index.intersection(other) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) + expected = Index(range(1, 5)) tm.assert_index_equal(result, expected) result = other.intersection(index) diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index ea305a9766103..8fca53c28a036 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -142,21 +142,24 @@ def test_asfreq_with_different_n(self): tm.assert_series_equal(result, excepted) @pytest.mark.parametrize( - "freq, is_str", + "freq", [ - ("2BMS", True), - ("2YS-MAR", True), - ("2bh", True), - (offsets.MonthBegin(2), False), - (offsets.BusinessMonthEnd(2), False), + "2BMS", + "2YS-MAR", + "2bh", + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), ], ) - def test_pi_asfreq_not_supported_frequency(self, freq, is_str): + def test_pi_asfreq_not_supported_frequency(self, freq): # GH#55785, GH#56945 - if is_str: - msg = f"{freq[1:]} is not supported as period frequency" - else: - msg = re.escape(f"{freq} is not supported as period frequency") + msg = "|".join( + [ + f"Invalid frequency: {freq}", + re.escape(f"{freq} is not supported as period frequency"), + "bh is not supported as period frequency", + ] + ) pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py 
b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 3867f9e3245dc..4fe429ce71ee4 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -140,3 +140,10 @@ def test_to_timestamp_1703(self): result = index.to_timestamp() assert result[0] == Timestamp("1/1/2012") + + +def test_ms_to_timestamp_error_message(): + # https://github.com/pandas-dev/pandas/issues/58974#issuecomment-2164265446 + ix = period_range("2000", periods=3, freq="M") + with pytest.raises(ValueError, match="for Period, please use 'M' instead of 'MS'"): + ix.to_timestamp("MS") diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 6aba9f17326ba..be07a71b283fd 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -33,7 +33,7 @@ class TestPeriodIndexDisallowedFreqs: ) def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): # GH#52064 - msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + msg = f"Invalid frequency: {freq_depr}" with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) @@ -41,20 +41,23 @@ def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): with pytest.raises(ValueError, match=msg): period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2SME", "2sme", "2CBME", "2BYE", "2Bye"]) - def test_period_index_frequency_invalid_freq(self, freq_depr): + @pytest.mark.parametrize( + "freq", + ["2SME", "2sme", "2BYE", "2Bye", "2CBME"], + ) + def test_period_index_frequency_invalid_freq(self, freq): # GH#9586 - msg = f"Invalid frequency: {freq_depr[1:]}" + msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): - period_range("2020-01", "2020-05", freq=freq_depr) + period_range("2020-01", "2020-05", freq=freq) with pytest.raises(ValueError, match=msg): - PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + PeriodIndex(["2020-01", "2020-05"], freq=freq) @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) def test_period_index_from_datetime_index_invalid_freq(self, freq): # GH#56899 - msg = f"Invalid frequency: {freq[1:]}" + msg = f"Invalid frequency: {freq}" rng = date_range("01-Jan-2012", periods=8, freq=freq) with pytest.raises(ValueError, match=msg): @@ -70,6 +73,30 @@ def test_period_index_T_L_U_N_raises(self, freq_depr): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + @pytest.mark.filterwarnings("ignore:Period with BDay freq:FutureWarning") + @pytest.mark.parametrize( + "freq,freq_depr", + [("2W", "2w"), ("2W-FRI", "2w-fri"), ("2D", "2d"), ("2B", "2b")], + ) + def test_period_index_depr_lowercase_frequency(self, freq, freq_depr): + # GH#58998 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) + + expected = PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) + + expected = period_range(start="2020-01-01", end="2020-01-02", freq=freq) + tm.assert_index_equal(result, expected) + class TestPeriodIndex: def test_from_ordinals(self): @@ -542,9 +569,7 @@ def test_mixed_freq_raises(self): with tm.assert_produces_warning(FutureWarning, match=msg): end_intv = Period("2005-05-01", "B") - msg = "'w' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): - vals = [end_intv, Period("2006-12-31", "w")] + vals = [end_intv, Period("2006-12-31", "W")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" depr_msg = r"PeriodDtype\[B\] is deprecated" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 67f4d7421df23..51b03024ce272 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -181,10 +181,8 @@ def test_construction_from_period(self): def test_mismatched_start_end_freq_raises(self): depr_msg = "Period with BDay freq is deprecated" - msg = "'w' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): - end_w = Period("2006-12-31", "1w") + end_w = Period("2006-12-31", "1W") with tm.assert_produces_warning(FutureWarning, match=depr_msg): start_b = Period("02-Apr-2005", "B") end_b = Period("2005-05-01", "B") @@ -205,7 +203,7 @@ def test_constructor_U(self): with pytest.raises(ValueError, match="Invalid frequency: X"): period_range("2007-1-1", periods=500, freq="X") - @pytest.mark.parametrize("freq_depr", ["2H", "2MIN", "2S", "2US", "2NS"]) + @pytest.mark.parametrize("freq_depr", ["2MIN", "2US", "2NS"]) def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): # GH#52536, GH#54939 msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " @@ -214,14 +212,13 @@ def test_uppercase_freq_deprecated_from_time_series(self, freq_depr): with tm.assert_produces_warning(FutureWarning, match=msg): period_range("2020-01-01 00:00:00 00:00", periods=2, freq=freq_depr) - @pytest.mark.parametrize("freq_depr", ["2m", "2q-sep", "2y", "2w"]) - def test_lowercase_freq_deprecated_from_time_series(self, freq_depr): - # GH#52536, GH#54939 - msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq_depr.upper()[1:]}' instead." 
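+    # GH#59143: miscased aliases such as "2m" or "2H" now raise ValueError
+    # outright; only the genuinely deprecated lowercase spellings (e.g. "2w")
+    # still warn, as covered by the deprecation test further below.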
+    @pytest.mark.parametrize("freq", ["2m", "2q-sep", "2y", "2H", "2S"])
+    def test_incorrect_case_freq_from_time_series_raises(self, freq):
+        # GH#52536, GH#54939, GH#59143
+        msg = f"Invalid frequency: {freq}"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            period_range(freq=freq_depr, start="1/1/2001", end="12/1/2009")
+        with pytest.raises(ValueError, match=msg):
+            period_range(freq=freq, start="1/1/2001", end="12/1/2009")

     @pytest.mark.parametrize("freq", ["2A", "2a", "2A-AUG", "2A-aug"])
     def test_A_raises_from_time_series(self, freq):
@@ -229,3 +226,14 @@
         with pytest.raises(ValueError, match=msg):
             period_range(freq=freq, start="1/1/2001", end="12/1/2009")
+
+    @pytest.mark.parametrize("freq", ["2w"])
+    def test_lowercase_freq_from_time_series_deprecated(self, freq):
+        # GH#52536, GH#54939
+        msg = (
+            f"'{freq[1:]}' is deprecated and will be removed in a "
+            f"future version. Please use '{freq.upper()[1:]}' instead."
+        )
+
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            period_range(freq=freq, start="1/1/2001", end="12/1/2009")
diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py
index 284e219fd20e4..e1ed96195e0a7 100644
--- a/pandas/tests/indexes/test_any_index.py
+++ b/pandas/tests/indexes/test_any_index.py
@@ -22,12 +22,6 @@ def test_boolean_context_compat(index):
         bool(index)


-def test_sort(index):
-    msg = "cannot sort an Index object in-place, use sort_values instead"
-    with pytest.raises(TypeError, match=msg):
-        index.sort()
-
-
 def test_hash_error(index):
     with pytest.raises(TypeError, match=f"unhashable type: '{type(index).__name__}'"):
         hash(index)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index c13473cd2e746..8d2245d0d9978 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -104,16 +104,9 @@ def test_constructor_copy(self, using_infer_string):
     )
     def test_constructor_from_index_dtlike(self, cast_as_obj, index):
         if cast_as_obj:
-            with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
-                result = Index(index.astype(object))
-        else:
-            result = Index(index)
-
-        tm.assert_index_equal(result, index)
-
-        if isinstance(index, DatetimeIndex):
-            assert result.tz == index.tz
-            if cast_as_obj:
+            result = Index(index.astype(object))
+            assert result.dtype == np.dtype(object)
+            if isinstance(index, DatetimeIndex):
                 # GH#23524 check that Index(dti, dtype=object) does not
                 # incorrectly raise ValueError, and that nanoseconds are not
                 # dropped
@@ -121,6 +114,10 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index):
                 result = Index(index, dtype=object)
                 assert result.dtype == np.object_
                 assert list(result) == list(index)
+        else:
+            result = Index(index)
+
+            tm.assert_index_equal(result, index)

     @pytest.mark.parametrize(
         "index,has_tz",
@@ -186,7 +183,7 @@ def test_constructor_int_dtype_nan(self):
         "klass,dtype,na_val",
         [
             (Index, np.float64, np.nan),
-            (DatetimeIndex, "datetime64[ns]", pd.NaT),
+            (DatetimeIndex, "datetime64[s]", pd.NaT),
         ],
     )
     def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val):
@@ -1586,7 +1583,7 @@ def test_ensure_index_uint64(self):

     def test_get_combined_index(self):
         result = _get_combined_index([])
-        expected = Index([])
+        expected = RangeIndex(0)
         tm.assert_index_equal(result, expected)

diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index 43445433e2a04..bf16554871efc 100644
--- a/pandas/tests/indexes/test_common.py
+++ 
b/pandas/tests/indexes/test_common.py @@ -223,7 +223,9 @@ def test_unique(self, index_flat): pass result = idx.unique() - tm.assert_index_equal(result, idx_unique) + tm.assert_index_equal( + result, idx_unique, exact=not isinstance(index, RangeIndex) + ) # nans: if not index._can_hold_na: diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index b544ebac43ece..4a31ae88a757a 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -61,16 +61,16 @@ def test_infer_nat(self, val): values = [NaT, val] idx = Index(values) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(values[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() @pytest.mark.parametrize("na_value", [None, np.nan]) @pytest.mark.parametrize("vtype", [list, tuple, iter]) @@ -138,6 +138,9 @@ def test_constructor_infer_nat_dt_like( ) expected = klass([NaT, NaT]) + if dtype[0] == "d": + # we infer all-NaT as second resolution + expected = expected.astype("M8[ns]") assert expected.dtype == dtype data = [ctor] data.insert(pos, nulls_fixture) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index b929616c814ee..6d01ba6adc87a 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs.tslibs import Timestamp @@ -228,6 +228,7 @@ def test_logical_compat(self, simple_index): with pytest.raises(TypeError, match=msg): idx.any() + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") @@ -438,7 +439,7 @@ def test_insert_base(self, index): assert index[0:4].equals(result) @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="completely different behavior, tested elsewher", ) def test_insert_out_of_bounds(self, index): @@ -822,12 +823,14 @@ def test_append_preserves_dtype(self, simple_index): result = index.append(index) assert result.dtype == index.dtype - tm.assert_index_equal(result[:N], index, check_exact=True) - tm.assert_index_equal(result[N:], index, check_exact=True) + + tm.assert_index_equal(result[:N], index, exact=False, check_exact=True) + tm.assert_index_equal(result[N:], index, exact=False, check_exact=True) alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_inv(self, simple_index, using_infer_string): idx = simple_index diff --git a/pandas/tests/indexes/timedeltas/methods/test_shift.py b/pandas/tests/indexes/timedeltas/methods/test_shift.py index a0986d1496881..9bbf06dc51a0c 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_shift.py +++ 
b/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -37,7 +37,7 @@ def test_tdi_shift_minutes(self): def test_tdi_shift_int(self): # GH#8083 - tdi = pd.to_timedelta(range(5), unit="d") + tdi = pd.to_timedelta(range(5), unit="D") trange = tdi._with_freq("infer") + pd.offsets.Hour(1) result = trange.shift(1) expected = TimedeltaIndex( @@ -54,7 +54,7 @@ def test_tdi_shift_int(self): def test_tdi_shift_nonstandard_freq(self): # GH#8083 - tdi = pd.to_timedelta(range(5), unit="d") + tdi = pd.to_timedelta(range(5), unit="D") trange = tdi._with_freq("infer") + pd.offsets.Hour(1) result = trange.shift(3, freq="2D 1s") expected = TimedeltaIndex( diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index 12ac5dd63bd8c..ace0ab7990138 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -168,7 +168,7 @@ def test_constructor_coverage(self): # NumPy string array strings = np.array(["1 days", "2 days", "3 days"]) result = TimedeltaIndex(strings) - expected = to_timedelta([1, 2, 3], unit="d") + expected = to_timedelta([1, 2, 3], unit="D") tm.assert_index_equal(result, expected) from_ints = TimedeltaIndex(expected.asi8) @@ -239,3 +239,28 @@ def test_from_categorical(self): ci = pd.CategoricalIndex(tdi) result = TimedeltaIndex(ci) tm.assert_index_equal(result, tdi) + + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): + # GH#52536, GH#59051 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." + + expected = TimedeltaIndex([f"1{unit}", f"2{unit}"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = TimedeltaIndex([f"1{unit_depr}", f"2{unit_depr}"]) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + tdi = to_timedelta([1, 2], unit=unit_depr) + tm.assert_index_equal(tdi, expected) diff --git a/pandas/tests/indexes/timedeltas/test_delete.py b/pandas/tests/indexes/timedeltas/test_delete.py index 6e6f54702ce1a..f49af7cd0befd 100644 --- a/pandas/tests/indexes/timedeltas/test_delete.py +++ b/pandas/tests/indexes/timedeltas/test_delete.py @@ -44,7 +44,7 @@ def test_delete_slice(self): # reset freq to None expected_3_5 = TimedeltaIndex( - ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ["1 D", "2 D", "3 D", "7 D", "8 D", "9 D", "10D"], freq=None, name="idx" ) cases = { diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 397f9d9e18331..e411555c65bea 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -20,8 +20,10 @@ class TestGetItem: def test_getitem_slice_keeps_name(self): - # GH#4226 - tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") + # GH#4226, GH#59051 + msg = "'d' is deprecated and will be removed in a future version." 
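+        # the lowercase "1d"/"5d" aliases below still parse, but constructing
+        # with them must now emit the FutureWarning matched by `msg`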
+ with tm.assert_produces_warning(FutureWarning, match=msg): + tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") assert tdi[1:].name == tdi.name def test_getitem(self): @@ -230,7 +232,7 @@ def test_take_invalid_kwargs(self): def test_take_equiv_getitem(self): tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] - idx = timedelta_range(start="1d", end="2d", freq="h", name="idx") + idx = timedelta_range(start="1D", end="2D", freq="h", name="idx") expected = TimedeltaIndex(tds, freq=None, name="idx") taken1 = idx.take([2, 4, 10]) @@ -337,8 +339,10 @@ def test_contains_nonunique(self): def test_contains(self): # Checking for any NaT-like objects - # GH#13603 - td = to_timedelta(range(5), unit="d") + offsets.Hour(1) + # GH#13603, GH#59051 + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + td = to_timedelta(range(5), unit="d") + offsets.Hour(1) for v in [NaT, None, float("nan"), np.nan]: assert v not in td diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 9f0552f8baa90..9a00c556dc515 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -103,30 +103,34 @@ def test_round(self): t1c = TimedeltaIndex(np.array([1, 1, 1], "m8[D]")).as_unit("ns") # note that negative times round DOWN! so don't give whole numbers - for freq, s1, s2 in [ - ("ns", t1, t2), - ("us", t1, t2), - ( - "ms", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + for freq, s1, s2 in [ + ("ns", t1, t2), + ("us", t1, t2), + ( + "ms", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), ), - ), - ( - "s", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ( + "s", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] + ), ), - ), - ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("d", t1c, -1 * t1c), - ]: - r1 = t1.round(freq) + ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("d", t1c, -1 * t1c), + ]: + r1 = t1.round(freq) + r2 = t2.round(freq) + tm.assert_index_equal(r1, s1) - r2 = t2.round(freq) tm.assert_index_equal(r2, s2) def test_components(self): diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index fce10d9176d74..ae88caf18fdae 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -42,7 +42,10 @@ def test_union_sort_false(self): tm.assert_index_equal(result, expected) def test_union_coverage(self): - idx = TimedeltaIndex(["3d", "1d", "2d"]) + # GH#59051 + msg = "'d' is deprecated and will be removed in a future version." 
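+        # the lowercase "3d"/"1d"/"2d" units are deprecated, so index
+        # construction is wrapped to capture the FutureWarning matched by `msg`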
+ with tm.assert_produces_warning(FutureWarning, match=msg): + idx = TimedeltaIndex(["3d", "1d", "2d"]) ordered = TimedeltaIndex(idx.sort_values(), freq="infer") result = ordered.union(idx) tm.assert_index_equal(result, ordered) @@ -70,7 +73,7 @@ def test_union_bug_1745(self): tm.assert_index_equal(result, exp) def test_union_bug_4564(self): - left = timedelta_range("1 day", "30d") + left = timedelta_range("1 day", "30D") right = left + pd.offsets.Minute(15) result = left.union(right) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 3120066741ffa..2066be8976e7f 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -51,9 +51,9 @@ def test_fields(self): s = Series(rng) s[1] = np.nan - tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=[0, 1])) + tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=range(2))) tm.assert_series_equal( - s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1]) + s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=range(2)) ) # preserve name (GH15589) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 1b645e2bc607f..6f3d29fb4240a 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,7 +3,6 @@ from pandas import ( Timedelta, - TimedeltaIndex, timedelta_range, to_timedelta, ) @@ -70,14 +69,12 @@ def test_linspace_behavior(self, periods, freq): expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) - def test_timedelta_range_H_deprecated(self): + def test_timedelta_range_H_raises(self): # GH#52536 - msg = "'H' is deprecated and will be removed in a future version." + msg = "Invalid frequency: H" - result = timedelta_range(start="0 days", end="4 days", periods=6) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = timedelta_range(start="0 days", end="4 days", freq="19H12min") - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days", end="4 days", freq="19H12min") def test_timedelta_range_T_raises(self): msg = "Invalid frequency: T" @@ -130,33 +127,6 @@ def test_timedelta_range_infer_freq(self): result = timedelta_range("0s", "1s", periods=31) assert result.freq is None - @pytest.mark.parametrize( - "freq_depr, start, end, expected_values, expected_freq", - [ - ( - "3.5S", - "05:03:01", - "05:03:10", - ["0 days 05:03:01", "0 days 05:03:04.500000", "0 days 05:03:08"], - "3500ms", - ), - ], - ) - def test_timedelta_range_deprecated_freq( - self, freq_depr, start, end, expected_values, expected_freq - ): - # GH#52536 - msg = ( - f"'{freq_depr[-1]}' is deprecated and will be removed in a future version." 
-        )
-
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = timedelta_range(start=start, end=end, freq=freq_depr)
-        expected = TimedeltaIndex(
-            expected_values, dtype="timedelta64[ns]", freq=expected_freq
-        )
-        tm.assert_index_equal(result, expected)
-
     @pytest.mark.parametrize(
         "freq_depr, start, end",
         [
@@ -170,9 +140,15 @@ def test_timedelta_range_deprecated_freq(
                 "5 hours",
                 "5 hours 8 minutes",
             ),
+            (
+                "3.5S",
+                "05:03:01",
+                "05:03:10",
+            ),
         ],
     )
     def test_timedelta_range_removed_freq(self, freq_depr, start, end):
+        # GH#59143
         msg = f"Invalid frequency: {freq_depr}"
         with pytest.raises(ValueError, match=msg):
             timedelta_range(start=start, end=end, freq=freq_depr)
diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py
index 283921a23e368..4c1efe9e4f81d 100644
--- a/pandas/tests/indexing/interval/test_interval_new.py
+++ b/pandas/tests/indexing/interval/test_interval_new.py
@@ -17,6 +17,9 @@ class TestIntervalIndex:
     @pytest.fixture
     def series_with_interval_index(self):
+        """
+        Fixture providing a Series with an IntervalIndex.
+        """
         return Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6)))

     def test_loc_with_interval(self, series_with_interval_index, indexer_sl):
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index 481a77fd03b05..7140ad7d1e9f5 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -232,3 +232,20 @@ def test_multiindex_from_tuples_with_nan(self):
             [("a", "b", "c"), (np.nan, np.nan, np.nan), ("d", "", "")]
         )
         tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("operation", ["div", "mul", "add", "sub"])
+    def test_groupby_rename_categories_operation_with_multiindex(self, operation):
+        # GH#51500
+        data = DataFrame(
+            [["C", "B", "B"], ["B", "A", "A"], ["B", "A", "B"]], columns=["0", "1", "2"]
+        )
+        data["0"] = data["0"].astype("category")
+        data["0"] = data["0"].cat.rename_categories({"C": "B", "B": "C"})
+
+        a = data.groupby(by=["0", "1"])["2"].value_counts()
+        b = data.groupby(by=["0", "1"]).size()
+
+        result = getattr(a, operation)(b)
+        expected = getattr(a, operation)(b.sort_index(ascending=False))
+
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py
index abf89c2b0d096..d732cb4d7fbbc 100644
--- a/pandas/tests/indexing/multiindex/test_setitem.py
+++ b/pandas/tests/indexing/multiindex/test_setitem.py
@@ -213,13 +213,11 @@ def test_multiindex_assignment_single_dtype(self):
         tm.assert_series_equal(result, exp)

         # arr + 0.5 cannot be cast losslessly to int, so we upcast
-        with tm.assert_produces_warning(
-            FutureWarning, match="item of incompatible dtype"
-        ):
+        with pytest.raises(TypeError, match="Invalid value"):
             df.loc[4, "c"] = arr + 0.5
-        result = df.loc[4, "c"]
-        exp = exp + 0.5
-        tm.assert_series_equal(result, exp)
+        # Upcast so that we can add .5
+        df = df.astype({"c": "float64"})
+        df.loc[4, "c"] = arr + 0.5

         # scalar ok
         df.loc[4, "c"] = 10
diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py
index 217ca74bd7fbd..10a8fa88b4b5e 100644
--- a/pandas/tests/indexing/test_at.py
+++ b/pandas/tests/indexing/test_at.py
@@ -24,12 +24,8 @@ def test_at_timezone():
     # https://github.com/pandas-dev/pandas/issues/33544
     result = DataFrame({"foo": [datetime(2000, 1, 1)]})
-    with 
tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result.at[0, "foo"] = datetime(2000, 1, 2, tzinfo=timezone.utc) - expected = DataFrame( - {"foo": [datetime(2000, 1, 2, tzinfo=timezone.utc)]}, dtype=object - ) - tm.assert_frame_equal(result, expected) def test_selection_methods_of_assigned_col(): diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 1b58f8e8b9831..c9f29b2cb55fe 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -511,13 +511,13 @@ def test_loc_and_at_with_categorical_index(self): # pandas scalars [Interval(1, 4), Interval(4, 6), Interval(6, 9)], [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], - [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], + [Timedelta(1, "D"), Timedelta(2, "D"), Timedelta(3, "D")], # pandas Integer arrays *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES), # other pandas arrays pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, pd.date_range("2019-01-01", periods=3).array, - pd.timedelta_range(start="1d", periods=3).array, + pd.timedelta_range(start="1D", periods=3).array, ], ) def test_loc_getitem_with_non_string_categories(self, idx_values, ordered): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index efae0b4dd84cc..64d8068fa9291 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -285,11 +285,9 @@ def test_detect_chained_assignment_changing_dtype(self): df.loc[2]["C"] = "foo" tm.assert_frame_equal(df, df_original) # TODO: Use tm.raises_chained_assignment_error() when PDEP-6 is enforced - with tm.raises_chained_assignment_error( - extra_warnings=(FutureWarning,), extra_match=(None,) - ): - df["C"][2] = "foo" - tm.assert_frame_equal(df, df_original) + with pytest.raises(TypeError, match="Invalid value"): + with tm.raises_chained_assignment_error(): + df["C"][2] = "foo" def test_setting_with_copy_bug(self): # operating on a copy diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index d4bc0341e732e..d5002a47c3447 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -598,7 +596,7 @@ def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01"), "datetime64[s]"), (pd.Timestamp("2012-01-01", tz="US/Eastern"), object), (1, object), ("x", object), @@ -615,7 +613,7 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04"), ] ) - assert obj.dtype == "datetime64[ns]" + assert obj.dtype == "datetime64[s]" exp = klass( [ @@ -630,10 +628,10 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[s, US/Eastern]"), (pd.Timestamp("2012-01-01"), object), # pre-2.0 with a mismatched tz we would get object result - (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), 
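# --- illustrative aside (not part of the patch) ---
# The expected dtypes in this file move from "datetime64[ns]" to
# "datetime64[s]": constructing from Timestamp objects now infers a
# non-nanosecond unit instead of forcing nanoseconds. A hedged sketch:
import pandas as pd

obj = pd.Series([pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")])
assert obj.dtype == "datetime64[s]"  # seconds resolution suffices here
assert obj.dt.as_unit("ns").dtype == "datetime64[ns]"  # explicit opt-back-in
# --- end aside ---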
"datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[s, US/Eastern]"), (1, object), ("x", object), ], @@ -650,7 +648,7 @@ def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04", tz=tz), ] ) - assert obj.dtype == "datetime64[ns, US/Eastern]" + assert obj.dtype == "datetime64[s, US/Eastern]" if getattr(fill_val, "tz", None) is None: fv = fill_val @@ -825,11 +823,10 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - # Expected needs adjustment for the infer string option, seems to work as expecetd - @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -850,7 +847,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) @@ -867,7 +863,7 @@ def test_replace_series_datetime_tz( self, how, to_key, from_key, replacer, using_infer_string ): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -891,7 +887,7 @@ def test_replace_series_datetime_tz( ) def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -900,8 +896,8 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) - else: - assert exp.dtype == to_key + elif to_key == from_key: + exp = exp.dt.as_unit("ns") result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 172aa9878caec..b05b5d3dea2dc 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,10 +6,11 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IndexingError from pandas import ( - NA, Categorical, CategoricalDtype, DataFrame, @@ -114,7 +115,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, index_or_series_or_array if frame_or_series is Series: values = obj.values else: - values = obj._mgr.arrays[0] + values = obj._mgr.blocks[0].values if frame_or_series is Series: obj.iloc[:2] = index_or_series_or_array(arr[2:]) @@ -528,10 +529,9 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self): assert len(df._mgr.blocks) == 1 # if the assigned values cannot be held by existing integer arrays, - # we cast - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + # we raise + with pytest.raises(TypeError, match="Invalid value"): df.iloc[:, 0] = df.iloc[:, 0] + 0.5 - assert 
len(df._mgr.blocks) == 2 expected = df.copy() @@ -1198,6 +1198,7 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iloc_setitem_multicolumn_to_datetime(self): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) @@ -1445,7 +1446,5 @@ def test_iloc_setitem_pure_position_based(self): def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") - expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 60a3ccf0b7483..6b072bc27ed81 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.errors import IndexingError @@ -180,14 +180,8 @@ def test_setitem_dtype_upcast(self): df["c"] = np.nan assert df["c"].dtype == np.float64 - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[0, "c"] = "foo" - expected = DataFrame( - {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)} - ) - tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("val", [3.14, "wxyz"]) def test_setitem_dtype_upcast2(self, val): @@ -199,19 +193,8 @@ def test_setitem_dtype_upcast2(self, val): ) left = df.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left.loc["a", "bar"] = val - right = DataFrame( - [[0, val, 2], [3, 4, 5]], - index=list("ab"), - columns=["foo", "bar", "baz"], - ) - - tm.assert_frame_equal(left, right) - assert is_integer_dtype(left["foo"]) - assert is_integer_dtype(left["baz"]) def test_setitem_dtype_upcast3(self): left = DataFrame( @@ -219,21 +202,9 @@ def test_setitem_dtype_upcast3(self): index=list("ab"), columns=["foo", "bar", "baz"], ) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left.loc["a", "bar"] = "wxyz" - right = DataFrame( - [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]], - index=list("ab"), - columns=["foo", "bar", "baz"], - ) - - tm.assert_frame_equal(left, right) - assert is_float_dtype(left["foo"]) - assert is_float_dtype(left["baz"]) - def test_dups_fancy_indexing(self): # GH 3455 @@ -455,9 +426,6 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="can't multiply arrow strings" - ) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -560,6 +528,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( @@ -683,7 +652,6 @@ def test_loc_setitem_fullindex_views(self): 
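# --- illustrative aside (not part of the patch) ---
# using_pyarrow_string_dtype is renamed to using_string_dtype in these
# hunks, and several tests xfail under the future string dtype. A sketch of
# the mode being guarded against (option available since pandas 2.1; the
# backing storage may require pyarrow depending on version):
import pandas as pd

with pd.option_context("future.infer_string", True):
    ser = pd.Series(["a", "b"])
    assert ser.dtype != object  # strings infer a dedicated string dtype
# --- end aside ---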
df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases @@ -728,7 +696,7 @@ def run_tests(df, rhs, right_loc, right_iloc): frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}") right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): run_tests(df, rhs, right_loc, right_iloc) @pytest.mark.parametrize( @@ -780,10 +748,10 @@ def test_loc_range_in_series_indexing(self, size): # GH 11652 s = Series(index=range(size), dtype=np.float64) s.loc[range(1)] = 42 - tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) + tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=range(1))) s.loc[range(2)] = 43 - tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) + tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=range(2))) def test_partial_boolean_frame_indexing(self): # GH 17170 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 01dab14c7e528..1b2dc0819006c 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -13,10 +13,9 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas._libs import index as libindex -from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas as pd @@ -383,12 +382,8 @@ def test_loc_setitem_slice(self): df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") ix = df1["a"] == 1 newb2 = df2.loc[ix, "b"] - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df1.loc[ix, "b"] = newb2 - expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") - tm.assert_frame_equal(df2, expected) def test_loc_setitem_dtype(self): # GH31340 @@ -572,54 +567,31 @@ def frame_for_consistency(self): def test_loc_setitem_consistency(self, frame_for_consistency, val): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series(0, index=range(5), dtype=np.int64), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = val - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series("foo", index=range(5)), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = "foo" - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( - { - "date": Series(1.0, 
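# --- illustrative aside (not part of the patch) ---
# Why expectations like index=[0, 1] become index=range(2) in this file:
# the default index is a RangeIndex, and passing a range keeps it one, so
# the expected object matches the result exactly.
import pandas as pd

assert pd.Series([1, 2]).index.equals(pd.RangeIndex(2))
assert isinstance(pd.Series([1, 2], index=range(2)).index, pd.RangeIndex)
assert not isinstance(pd.Series([1, 2], index=[0, 1]).index, pd.RangeIndex)
# --- end aside ---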
index=range(5)), - "val": Series(range(5), dtype=np.int64), - } - ) df = frame_for_consistency.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = 1.0 - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, "date"] = "string" - expected = DataFrame({"date": Series(["string"])}) - tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_empty(self): # empty (essentially noops) @@ -637,6 +609,7 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_loc_setitem_consistency_slice_column_len(self): # .loc[:,column] setting with slice == len of the column # GH10408 @@ -677,16 +650,11 @@ def test_loc_setitem_consistency_slice_column_len(self): # timedelta64[m] -> float, so this cannot be done inplace, so # no warning - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[:, ("Respondent", "Duration")] = df.loc[ :, ("Respondent", "Duration") ] / Timedelta(60_000_000_000) - expected = Series( - [23.0, 12.0, 14.0, 36.0], index=df.index, name=("Respondent", "Duration") - ) - tm.assert_series_equal(df[("Respondent", "Duration")], expected) - @pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s", "ms", "us"]) def test_loc_assign_non_ns_datetime(self, unit): # GH 27395, non-ns dtype assignment via .loc should work @@ -711,7 +679,7 @@ def test_loc_modify_datetime(self): {"date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]} ) - df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True) + df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True).dt.as_unit("ms") df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"] df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"] @@ -865,6 +833,7 @@ def test_loc_setitem_frame_multiples(self): "val": Series([0, 1, 0, 1, 2], dtype=np.int64), } ) + expected["date"] = expected["date"].astype("M8[ns]") rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs @@ -1236,7 +1205,7 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") + @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") def test_loc_setitem_str_to_small_float_conversion_type(self): # GH#20388 @@ -1280,7 +1249,7 @@ def test_loc_getitem_time_object(self, frame_or_series): tm.assert_equal(result, expected) @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) - @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) + @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool]) def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): sp_sparse = pytest.importorskip("scipy.sparse") @@ -1295,13 +1264,13 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): # regression test for GH#34526 itr_idx = range(2, rows) - result = df.loc[itr_idx].values + result = np.nan_to_num(df.loc[itr_idx].values) expected = 
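# --- illustrative aside (not part of the patch) ---
# SparseDtype(dtype) without an explicit fill_value picks a dtype-dependent
# default (NaN for floats, 0 for ints), which is why the sparse
# expectations here drop fill_value=0 and NaN-normalize the float results.
import numpy as np
from pandas import SparseDtype

assert np.isnan(SparseDtype(np.float64).fill_value)
assert SparseDtype(np.int64).fill_value == 0
# --- end aside ---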
spmatrix.toarray()[itr_idx] tm.assert_numpy_array_equal(result, expected) # regression test for GH#34540 result = df.loc[itr_idx].dtypes.values - expected = np.full(cols, SparseDtype(dtype, fill_value=0)) + expected = np.full(cols, SparseDtype(dtype)) tm.assert_numpy_array_equal(result, expected) def test_loc_getitem_listlike_all_retains_sparse(self): @@ -1313,18 +1282,16 @@ def test_loc_getitem_sparse_frame(self): # GH34687 sp_sparse = pytest.importorskip("scipy.sparse") - df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5)) + df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64)) result = df.loc[range(2)] expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]], - dtype=SparseDtype("float64", 0.0), + [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]], + dtype=SparseDtype(np.int64), ) tm.assert_frame_equal(result, expected) result = df.loc[range(2)].loc[range(1)] - expected = DataFrame( - [[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0) - ) + expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64)) tm.assert_frame_equal(result, expected) def test_loc_getitem_sparse_series(self): @@ -1412,13 +1379,9 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 @@ -1445,9 +1408,8 @@ def test_loc_setitem_datetime_coercion(self): df.loc[0:1, "c"] = np.datetime64("2008-08-08") assert Timestamp("2008-08-08") == df.loc[0, "c"] assert Timestamp("2008-08-08") == df.loc[1, "c"] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[2, "c"] = date(2005, 5, 5) - assert Timestamp("2005-05-05").date() == df.loc[2, "c"] @pytest.mark.parametrize("idxer", ["var", ["var"]]) def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): @@ -1458,12 +1420,13 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype result = DataFrame(index=idx, columns=["var"], dtype=np.float64) - with tm.assert_produces_warning( - FutureWarning if idxer == "var" else None, match="incompatible dtype" - ): + if idxer == "var": + with pytest.raises(TypeError, match="Invalid value"): + result.loc[:, idxer] = expected + else: # See https://github.com/pandas-dev/pandas/issues/56223 result.loc[:, idxer] = expected - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_loc_setitem_time_key(self): index = date_range("2012-01-01", "2012-01-05", freq="30min") @@ -1609,16 +1572,8 @@ def test_loc_setitem_cast2(self): # dtype conversion on setting df = DataFrame(np.random.default_rng(2).random((30, 3)), columns=tuple("ABC")) df["event"] = np.nan - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): df.loc[10, "event"] = "foo" - result = df.dtypes - 
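# --- illustrative aside (not part of the patch) ---
# The same enforcement seen from the datetime side, mirroring
# test_loc_setitem_datetime_coercion above: a datetime.date no longer
# coerces into a datetime64 column.
from datetime import date

import pandas as pd
import pytest

df = pd.DataFrame({"c": pd.to_datetime(["2008-08-08", "2008-08-09"])})
with pytest.raises(TypeError, match="Invalid value"):
    df.loc[1, "c"] = date(2005, 5, 5)
# --- end aside ---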
expected = Series( - [np.dtype("float64")] * 3 + [np.dtype("object")], - index=["A", "B", "C", "event"], - ) - tm.assert_series_equal(result, expected) def test_loc_setitem_cast3(self): # Test that data type is preserved . GH#5782 @@ -1814,7 +1769,7 @@ def test_loc_getitem_datetime_string_with_datetimeindex(self): result = df.loc[["2010-01-01", "2010-01-05"], ["a", "b"]] expected = DataFrame( {"a": [0, 4], "b": [0, 4]}, - index=DatetimeIndex(["2010-01-01", "2010-01-05"]), + index=DatetimeIndex(["2010-01-01", "2010-01-05"]).as_unit("ns"), ) tm.assert_frame_equal(result, expected) @@ -1974,7 +1929,7 @@ def test_loc_setitem_empty_series(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc[1] = 1 - tm.assert_series_equal(ser, Series([1], index=[1])) + tm.assert_series_equal(ser, Series([1], index=range(1, 2))) ser.loc[3] = 3 tm.assert_series_equal(ser, Series([1, 3], index=[1, 3])) @@ -1984,7 +1939,7 @@ def test_loc_setitem_empty_series_float(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc[1] = 1.0 - tm.assert_series_equal(ser, Series([1.0], index=[1])) + tm.assert_series_equal(ser, Series([1.0], index=range(1, 2))) ser.loc[3] = 3.0 tm.assert_series_equal(ser, Series([1.0, 3.0], index=[1, 3])) @@ -2082,7 +2037,7 @@ def test_setitem_with_expansion(self): expected = Series([v[0].tz_convert("UTC"), df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) - v = df.loc[df.new_col == "new", "time"] + Timedelta("1s") + v = df.loc[df.new_col == "new", "time"] + Timedelta("1s").as_unit("s") df.loc[df.new_col == "new", "time"] = v tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) @@ -2107,7 +2062,7 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index): N = len(index) arr = np.arange(N).astype(np.int64) - orig = DataFrame(arr, index=index, columns=[0]) + orig = DataFrame(arr, index=index) # key that will requiring object-dtype casting in the index key = "kapow" @@ -2120,7 +2075,7 @@ def test_loc_setitem_with_expansion_nonunique_index(self, index): else: assert exp_index[-1] == key exp_data = np.arange(N + 1).astype(np.float64) - expected = DataFrame(exp_data, index=exp_index, columns=[0]) + expected = DataFrame(exp_data, index=exp_index) # Add new row, but no new columns df = orig.copy() @@ -2973,20 +2928,9 @@ def test_loc_setitem_uint8_upcast(value): # GH#26049 df = DataFrame([1, 2, 3, 4], columns=["col1"], dtype="uint8") - with tm.assert_produces_warning(FutureWarning, match="item of incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc[2, "col1"] = value # value that can't be held in uint8 - if np_version_gt2 and isinstance(value, np.int16): - # Note, result type of uint8 + int16 is int16 - # in numpy < 2, though, numpy would inspect the - # value and see that it could fit in an uint16, resulting in a uint16 - dtype = "int16" - else: - dtype = "uint16" - - expected = DataFrame([1, 2, 300, 4], columns=["col1"], dtype=dtype) - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "fill_val,exp_dtype", @@ -3321,3 +3265,18 @@ def test_loc_nonunique_masked_index(self): index=Index(np.array(ids).repeat(1000), dtype="Int64"), ) tm.assert_frame_equal(result, expected) + + def test_loc_index_alignment_for_series(self): + # GH #56024 + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + other = Series([200, 999], index=[1, 0]) + df.loc[:, "a"] = other + expected = DataFrame({"a": [999, 200], "b": [3, 4]}) + tm.assert_frame_equal(expected, df) + + def 
test_loc_reindexing_of_empty_index(self): + # GH 57735 + df = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) + df.loc[Series([False] * 4, index=df.index, name=0), 0] = df[0] + expected = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index b0a041ed5b69c..4d232d5ed1312 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -580,7 +580,7 @@ def test_partial_set_invalid(self): ], ), ( - date_range(start="2000", periods=20, freq="D"), + date_range(start="2000", periods=20, freq="D", unit="s"), ["2000-01-04", "2000-01-08", "2000-01-12"], [ Timestamp("2000-01-04"), diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 60e05c2c65124..76910db941d36 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import iNaT from pandas.compat import ( is_ci_environment, @@ -407,6 +409,7 @@ def test_empty_string_column(): tm.assert_frame_equal(df, result) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") @@ -423,6 +426,7 @@ def test_non_str_names(): assert names == ["0"] +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_non_str_names_w_duplicates(): # https://github.com/pandas-dev/pandas/issues/56701 df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) @@ -603,7 +607,8 @@ def test_empty_dataframe(): ), ( pd.Series( - [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)] + [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)], + dtype="M8[ns]", ), (DtypeKind.DATETIME, 64, "tsn:", "="), (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index c189d5248b1f3..591157bbe87fe 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -44,7 +44,10 @@ def test_namespace(): def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") - blk = api.make_block(dti, placement=[0]) + + msg = "make_block is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + blk = api.make_block(dti, placement=[0]) assert blk.shape == (1, 3) assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 749e2c4a86b55..579d3fbfb3435 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1280,20 +1280,19 @@ def test_interval_can_hold_element(self, dtype, element): # `elem` to not have the same length as `arr` ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither") elem = element(ii2) - msg = "Setting an item of incompatible dtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)]) elem = element(ii3) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not 
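# --- illustrative aside (not part of the patch) ---
# make_block now emits DeprecationWarning, so every call in these internals
# tests is wrapped. A sketch of asserting the warning, mirroring
# test_make_block_2d_with_dti above:
import pandas as pd
import pandas._testing as tm
from pandas.core.internals import api

dti = pd.date_range("2012", periods=3, tz="UTC")
with tm.assert_produces_warning(DeprecationWarning, match="make_block is deprecated"):
    blk = api.make_block(dti, placement=[0])
assert blk.shape == (1, 3)
# --- end aside ---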
blk._can_hold_element(elem) ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)]) elem = element(ii4) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) @@ -1313,13 +1312,12 @@ def test_period_can_hold_element(self, element): # `elem` to not have the same length as `arr` pi2 = pi.asfreq("D")[:-1] elem = element(pi2) - msg = "Setting an item of incompatible dtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, pi, False) dti = pi.to_timestamp("s")[:-1] elem = element(dti) - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(TypeError, match="Invalid value"): self.check_series_setitem(elem, pi, False) def check_can_hold_element(self, obj, elem, inplace: bool): @@ -1368,8 +1366,10 @@ def test_validate_ndim(): placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" + depr_msg = "make_block is deprecated" with pytest.raises(ValueError, match=msg): - make_block(values, placement, ndim=2) + with tm.assert_produces_warning(DeprecationWarning, match=depr_msg): + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1384,8 +1384,12 @@ def test_make_block_no_pandas_array(block_maker): # https://github.com/pandas-dev/pandas/pull/24866 arr = pd.arrays.NumpyExtensionArray(np.array([1, 2])) + depr_msg = "make_block is deprecated" + warn = DeprecationWarning if block_maker is make_block else None + # NumpyExtensionArray, no dtype - result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: @@ -1393,14 +1397,16 @@ def test_make_block_no_pandas_array(block_maker): assert result.is_extension is False # NumpyExtensionArray, NumpyEADtype - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False - # new_block no longer taked dtype keyword + # new_block no longer accepts dtype keyword # ndarray, NumpyEADtype - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) + with tm.assert_produces_warning(warn, match=depr_msg): + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) assert result.dtype.kind in ["i", "u"] assert result.is_extension is False diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index ab6cacc4cc860..bdefadf3dbec0 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -224,19 +224,3 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] - - -@pytest.fixture( - params=[ - "python", - pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - ] -) -def string_storage(request): - """ - Parametrized fixture for pd.options.mode.string_storage. 
- - * 'python' - * 'pyarrow' - """ - return request.param diff --git a/pandas/tests/io/data/excel/test_none_type.xlsx b/pandas/tests/io/data/excel/test_none_type.xlsx new file mode 100644 index 0000000000000..38aaf72ddfc8f Binary files /dev/null and b/pandas/tests/io/data/excel/test_none_type.xlsx differ diff --git a/pandas/tests/io/data/stata/stata-compat-102.dta b/pandas/tests/io/data/stata/stata-compat-102.dta new file mode 100644 index 0000000000000..424b767b0011c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-102.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta new file mode 100644 index 0000000000000..adfeb6c672333 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-103.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-104.dta b/pandas/tests/io/data/stata/stata-compat-104.dta new file mode 100644 index 0000000000000..9bc3659afd31c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-104.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta new file mode 100644 index 0000000000000..0e2ef231f91c0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-103.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-104.dta b/pandas/tests/io/data/stata/stata-compat-be-104.dta new file mode 100644 index 0000000000000..98185d8ce27dc Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-104.dta differ diff --git a/pandas/tests/io/data/stata/stata12_118.dta b/pandas/tests/io/data/stata/stata12_118.dta new file mode 100644 index 0000000000000..87c6d1f063150 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_118.dta differ diff --git a/pandas/tests/io/data/stata/stata12_119.dta b/pandas/tests/io/data/stata/stata12_119.dta new file mode 100644 index 0000000000000..fa63f0135738e Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_119.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_117.dta b/pandas/tests/io/data/stata/stata12_be_117.dta new file mode 100644 index 0000000000000..7f84d15fb76d0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_117.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_118.dta b/pandas/tests/io/data/stata/stata12_be_118.dta new file mode 100644 index 0000000000000..9ed6f39b0f9b5 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata12_be_119.dta b/pandas/tests/io/data/stata/stata12_be_119.dta new file mode 100644 index 0000000000000..3c9736d0f3af3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata12_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata14_119.dta b/pandas/tests/io/data/stata/stata14_119.dta new file mode 100644 index 0000000000000..e64353213b1c9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_119.dta differ diff --git a/pandas/tests/io/data/stata/stata14_be_118.dta b/pandas/tests/io/data/stata/stata14_be_118.dta new file mode 100644 index 0000000000000..584ec0984c49e Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata14_be_119.dta b/pandas/tests/io/data/stata/stata14_be_119.dta new file mode 100644 index 0000000000000..09d08f7e992ea Binary files /dev/null and b/pandas/tests/io/data/stata/stata14_be_119.dta differ diff --git 
a/pandas/tests/io/data/stata/stata16_119.dta b/pandas/tests/io/data/stata/stata16_119.dta new file mode 100644 index 0000000000000..d03c489d4342d Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_119.dta differ diff --git a/pandas/tests/io/data/stata/stata16_be_118.dta b/pandas/tests/io/data/stata/stata16_be_118.dta new file mode 100644 index 0000000000000..bae769c038820 Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_be_118.dta differ diff --git a/pandas/tests/io/data/stata/stata16_be_119.dta b/pandas/tests/io/data/stata/stata16_be_119.dta new file mode 100644 index 0000000000000..e928a9713715d Binary files /dev/null and b/pandas/tests/io/data/stata/stata16_be_119.dta differ diff --git a/pandas/tests/io/data/stata/stata1_102.dta b/pandas/tests/io/data/stata/stata1_102.dta new file mode 100644 index 0000000000000..d0ca1b2a8c02d Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_102.dta differ diff --git a/pandas/tests/io/data/stata/stata1_103.dta b/pandas/tests/io/data/stata/stata1_103.dta new file mode 100644 index 0000000000000..98072ba6bd4fc Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_103.dta differ diff --git a/pandas/tests/io/data/stata/stata1_104.dta b/pandas/tests/io/data/stata/stata1_104.dta new file mode 100644 index 0000000000000..a46aeb9128ecf Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_104.dta differ diff --git a/pandas/tests/io/data/stata/stata1_105.dta b/pandas/tests/io/data/stata/stata1_105.dta new file mode 100644 index 0000000000000..ba2c463486dbf Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_105.dta differ diff --git a/pandas/tests/io/data/stata/stata1_108.dta b/pandas/tests/io/data/stata/stata1_108.dta new file mode 100644 index 0000000000000..6c948b4490589 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_108.dta differ diff --git a/pandas/tests/io/data/stata/stata1_110.dta b/pandas/tests/io/data/stata/stata1_110.dta new file mode 100644 index 0000000000000..c9e2ca72dbd4e Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_110.dta differ diff --git a/pandas/tests/io/data/stata/stata1_111.dta b/pandas/tests/io/data/stata/stata1_111.dta new file mode 100644 index 0000000000000..21370d3027458 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_111.dta differ diff --git a/pandas/tests/io/data/stata/stata1_113.dta b/pandas/tests/io/data/stata/stata1_113.dta new file mode 100644 index 0000000000000..6fcf55f0406e9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_113.dta differ diff --git a/pandas/tests/io/data/stata/stata1_115.dta b/pandas/tests/io/data/stata/stata1_115.dta new file mode 100644 index 0000000000000..2e5258da49c3c Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_115.dta differ diff --git a/pandas/tests/io/data/stata/stata1_118.dta b/pandas/tests/io/data/stata/stata1_118.dta new file mode 100644 index 0000000000000..26d7beccb745c Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_118.dta differ diff --git a/pandas/tests/io/data/stata/stata1_119.dta b/pandas/tests/io/data/stata/stata1_119.dta new file mode 100644 index 0000000000000..284daa78bf6db Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_119.dta differ diff --git a/pandas/tests/io/data/stata/stata4_102.dta b/pandas/tests/io/data/stata/stata4_102.dta new file mode 100644 index 0000000000000..669fe06c2b492 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_102.dta differ diff --git 
a/pandas/tests/io/data/stata/stata4_103.dta b/pandas/tests/io/data/stata/stata4_103.dta new file mode 100644 index 0000000000000..3c63935e63df9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_103.dta differ diff --git a/pandas/tests/io/data/stata/stata4_104.dta b/pandas/tests/io/data/stata/stata4_104.dta new file mode 100644 index 0000000000000..c2517355ebff1 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_104.dta differ diff --git a/pandas/tests/io/data/stata/stata8_102.dta b/pandas/tests/io/data/stata/stata8_102.dta new file mode 100644 index 0000000000000..5d3a4fb171e9c Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_102.dta differ diff --git a/pandas/tests/io/data/stata/stata8_103.dta b/pandas/tests/io/data/stata/stata8_103.dta new file mode 100644 index 0000000000000..623a21e37650f Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_103.dta differ diff --git a/pandas/tests/io/data/stata/stata8_104.dta b/pandas/tests/io/data/stata/stata8_104.dta new file mode 100644 index 0000000000000..df79d6a8af230 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_104.dta differ diff --git a/pandas/tests/io/data/stata/stata8_105.dta b/pandas/tests/io/data/stata/stata8_105.dta new file mode 100644 index 0000000000000..cf01463a83d81 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_105.dta differ diff --git a/pandas/tests/io/data/stata/stata8_108.dta b/pandas/tests/io/data/stata/stata8_108.dta new file mode 100644 index 0000000000000..962f7f4331fb3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_108.dta differ diff --git a/pandas/tests/io/data/stata/stata8_110.dta b/pandas/tests/io/data/stata/stata8_110.dta new file mode 100644 index 0000000000000..a7fe9a3b7e639 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_110.dta differ diff --git a/pandas/tests/io/data/stata/stata8_111.dta b/pandas/tests/io/data/stata/stata8_111.dta new file mode 100644 index 0000000000000..cb96ac0e0f5d3 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_111.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_102.dta b/pandas/tests/io/data/stata/stata_int_validranges_102.dta new file mode 100644 index 0000000000000..69de2e2f7f91d Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_102.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_103.dta b/pandas/tests/io/data/stata/stata_int_validranges_103.dta new file mode 100644 index 0000000000000..71f03873808e2 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_103.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_104.dta b/pandas/tests/io/data/stata/stata_int_validranges_104.dta new file mode 100644 index 0000000000000..f6dff2a6b42d9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_104.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_105.dta b/pandas/tests/io/data/stata/stata_int_validranges_105.dta new file mode 100644 index 0000000000000..d0a7ad0f01d16 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_105.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_108.dta b/pandas/tests/io/data/stata/stata_int_validranges_108.dta new file mode 100644 index 0000000000000..47b715bce21ef Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_108.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_110.dta 
b/pandas/tests/io/data/stata/stata_int_validranges_110.dta new file mode 100644 index 0000000000000..2fe5dee018f4e Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_110.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_111.dta b/pandas/tests/io/data/stata/stata_int_validranges_111.dta new file mode 100644 index 0000000000000..07052d824f132 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_111.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_113.dta b/pandas/tests/io/data/stata/stata_int_validranges_113.dta new file mode 100644 index 0000000000000..4060c1c88ea12 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_113.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_114.dta b/pandas/tests/io/data/stata/stata_int_validranges_114.dta new file mode 100644 index 0000000000000..71c22366e9b1a Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_114.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_115.dta b/pandas/tests/io/data/stata/stata_int_validranges_115.dta new file mode 100644 index 0000000000000..80e1dc8670b38 Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_115.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_117.dta b/pandas/tests/io/data/stata/stata_int_validranges_117.dta new file mode 100644 index 0000000000000..c220037941f4f Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_117.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_118.dta b/pandas/tests/io/data/stata/stata_int_validranges_118.dta new file mode 100644 index 0000000000000..4bbd823bff63e Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_118.dta differ diff --git a/pandas/tests/io/data/stata/stata_int_validranges_119.dta b/pandas/tests/io/data/stata/stata_int_validranges_119.dta new file mode 100644 index 0000000000000..6bd9bbde1d22d Binary files /dev/null and b/pandas/tests/io/data/stata/stata_int_validranges_119.dta differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f0a72ba6163fa..65a52bc8e0794 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -17,7 +17,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -141,10 +141,13 @@ def df_ref(datapath): def get_exp_unit(read_ext: str, engine: str | None) -> str: - return "ns" + unit = "us" + if (read_ext == ".ods") ^ (engine == "calamine"): + unit = "s" + return unit -def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None: +def adjust_expected(expected: DataFrame, read_ext: str, engine: str | None) -> None: expected.index.name = None unit = get_exp_unit(read_ext, engine) # error: "Index" has no attribute "as_unit" @@ -161,6 +164,36 @@ def xfail_datetimes_with_pyxlsb(engine, request): class TestReaders: + @pytest.mark.parametrize("col", [[True, None, False], [True], [True, False]]) + def test_read_excel_type_check(self, col, tmp_excel, read_ext): + # GH 58159 + if read_ext in (".xlsb", ".xls"): + pytest.skip(f"No engine for filetype: '{read_ext}'") + df = DataFrame({"bool_column": col}, dtype="boolean") + df.to_excel(tmp_excel, index=False) + df2 = pd.read_excel(tmp_excel, dtype={"bool_column": 
"boolean"}) + tm.assert_frame_equal(df, df2) + + def test_pass_none_type(self, datapath): + # GH 58159 + f_path = datapath("io", "data", "excel", "test_none_type.xlsx") + + with pd.ExcelFile(f_path) as excel: + parsed = pd.read_excel( + excel, + sheet_name="Sheet1", + keep_default_na=True, + na_values=["nan", "None", "abcd"], + dtype="boolean", + engine="openpyxl", + ) + expected = DataFrame( + {"Test": [True, None, False, None, False, None, True]}, + dtype="boolean", + ) + + tm.assert_frame_equal(parsed, expected) + @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch): """ @@ -596,6 +629,7 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): expected = DataFrame(expected) tm.assert_frame_equal(actual, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): @@ -659,7 +693,7 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel): tm.assert_frame_equal(result, df) @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="infer_string takes precedence" + using_string_dtype(), reason="infer_string takes precedence", strict=False ) def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel): # GH#36712 @@ -1065,7 +1099,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext): tm.assert_frame_equal(actual, expected) # "mi_column_name" sheet - expected.index = list(range(4)) + expected.index = range(4) expected.columns = mi.set_names(["c1", "c2"]) actual = pd.read_excel( mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0 @@ -1117,7 +1151,6 @@ def test_read_excel_multiindex_blank_after_name( mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) unit = get_exp_unit(read_ext, engine) - expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -1675,6 +1708,7 @@ def test_read_datetime_multiindex(self, request, engine, read_ext): actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) unit = get_exp_unit(read_ext, engine) + dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]") expected_column_index = MultiIndex.from_arrays( [dti[:1], dti[1:]], diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 859152db84b7d..0d753cb871c64 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -12,7 +12,8 @@ import numpy as np import pytest -from pandas.compat._constants import PY310 +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -37,7 +38,9 @@ def get_exp_unit(path: str) -> str: - return "ns" + if path.endswith(".ods"): + return "s" + return "us" @pytest.fixture @@ -48,7 +51,7 @@ def frame(float_frame): return float_frame[:10] -@pytest.fixture(params=[True, False]) +@pytest.fixture(params=[True, False, "columns"]) def merge_cells(request): return request.param @@ -279,6 +282,7 @@ def test_excel_multindex_roundtrip( ) tm.assert_frame_equal(df, act, check_names=check_names) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_read_excel_parse_dates(self, tmp_excel): # see gh-11544, gh-12051 df = DataFrame( @@ -293,12 +297,15 @@ def test_read_excel_parse_dates(self, tmp_excel): tm.assert_frame_equal(df2, res) res = pd.read_excel(tmp_excel, parse_dates=["date_strings"], 
index_col=0) - tm.assert_frame_equal(df, res) + expected = df[:] + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(res, expected) res = pd.read_excel( tmp_excel, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 ) - tm.assert_frame_equal(df, res) + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(expected, res) def test_multiindex_interval_datetimes(self, tmp_excel): # GH 30986 @@ -326,6 +333,7 @@ def test_multiindex_interval_datetimes(self, tmp_excel): ], ] ), + columns=Index([0]), ) tm.assert_frame_equal(result, expected) @@ -371,7 +379,10 @@ def test_excel_sheet_size(self, tmp_excel): col_df.to_excel(tmp_excel) def test_excel_sheet_by_name_raise(self, tmp_excel): - gt = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) + gt = DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), + index=Index(list(range(10))), + ) gt.to_excel(tmp_excel) with ExcelFile(tmp_excel) as xl: @@ -492,7 +503,9 @@ def test_int_types(self, np_type, tmp_excel): # Test np.int values read come back as int # (rather than float which is Excel's format). df = DataFrame( - np.random.default_rng(2).integers(-10, 10, size=(10, 2)), dtype=np_type + np.random.default_rng(2).integers(-10, 10, size=(10, 2)), + dtype=np_type, + index=Index(list(range(10))), ) df.to_excel(tmp_excel, sheet_name="test1") @@ -508,7 +521,11 @@ def test_int_types(self, np_type, tmp_excel): @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, np_type, tmp_excel): # Test np.float values read come back as float. - df = DataFrame(np.random.default_rng(2).random(10), dtype=np_type) + df = DataFrame( + np.random.default_rng(2).random(10), + dtype=np_type, + index=Index(list(range(10))), + ) df.to_excel(tmp_excel, sheet_name="test1") with ExcelFile(tmp_excel) as reader: @@ -520,7 +537,7 @@ def test_float_types(self, np_type, tmp_excel): def test_bool_types(self, tmp_excel): # Test np.bool_ values read come back as float. 
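# --- illustrative aside (not part of the patch) ---
# The merge_cells fixture used by these writer tests gains a "columns"
# option: in 3.0, to_excel can merge only the MultiIndex column-header
# cells while leaving index cells unmerged. Hypothetical usage (writing
# requires an Excel engine such as openpyxl):
import pandas as pd

df = pd.DataFrame(
    [[1, 2]], columns=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
)
# df.to_excel("out.xlsx", merge_cells="columns")
# --- end aside ---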
- df = DataFrame([1, 0, True, False], dtype=np.bool_) + df = DataFrame([1, 0, True, False], dtype=np.bool_, index=Index(list(range(4)))) df.to_excel(tmp_excel, sheet_name="test1") with ExcelFile(tmp_excel) as reader: @@ -531,7 +548,7 @@ def test_bool_types(self, tmp_excel): tm.assert_frame_equal(df, recons) def test_inf_roundtrip(self, tmp_excel): - df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) + df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)], index=Index(list(range(3)))) df.to_excel(tmp_excel, sheet_name="test1") with ExcelFile(tmp_excel) as reader: @@ -547,6 +564,7 @@ def test_sheets(self, frame, tmp_excel): columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=5, freq="B"), ) + index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -627,7 +645,13 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, tmp_excel): df.index.names = ["test"] assert df.index.names == recons.index.names - df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 + df = ( + DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), + index=Index(list(range(10))), + ) + >= 0 + ) df.to_excel( tmp_excel, sheet_name="test1", index_label="test", merge_cells=merge_cells ) @@ -695,7 +719,6 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # # Excel output format strings unit = get_exp_unit(tmp_excel) - df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], @@ -732,6 +755,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): with ExcelFile(filename2) as reader2: rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + # TODO: why do we get different units? + rs2 = rs2.astype(f"M8[{unit}]") + tm.assert_frame_equal(rs1, rs2) # Since the reader returns a datetime object for dates, @@ -1243,13 +1269,12 @@ def test_engine_kwargs(self, engine, tmp_excel): "xlsxwriter": r"__init__() got an unexpected keyword argument 'foo'", } - if PY310: - msgs["openpyxl"] = ( - "Workbook.__init__() got an unexpected keyword argument 'foo'" - ) - msgs["xlsxwriter"] = ( - "Workbook.__init__() got an unexpected keyword argument 'foo'" - ) + msgs["openpyxl"] = ( + "Workbook.__init__() got an unexpected keyword argument 'foo'" + ) + msgs["xlsxwriter"] = ( + "Workbook.__init__() got an unexpected keyword argument 'foo'" + ) # Handle change in error message for openpyxl (write and append mode) if engine == "openpyxl" and not os.path.exists(tmp_excel): @@ -1310,6 +1335,7 @@ def test_freeze_panes(self, tmp_excel): result = pd.read_excel(tmp_excel, index_col=0) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_path_lib(self, engine, ext): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), diff --git a/pandas/tests/io/formats/style/test_matplotlib.py b/pandas/tests/io/formats/style/test_matplotlib.py index 70ddd65c02d14..296fb20d855c4 100644 --- a/pandas/tests/io/formats/style/test_matplotlib.py +++ b/pandas/tests/io/formats/style/test_matplotlib.py @@ -7,11 +7,9 @@ Series, ) -pytest.importorskip("matplotlib") +mpl = pytest.importorskip("matplotlib") pytest.importorskip("jinja2") -import matplotlib as mpl - from pandas.io.formats.style import Styler pytestmark = pytest.mark.usefixtures("mpl_cleanup") diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index eb221686dd165..1abe6238d3922 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ 
b/pandas/tests/io/formats/style/test_to_latex.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, MultiIndex, @@ -729,6 +731,7 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize( "columns, siunitx", diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b12cfc6876a8e..af7b04d66096a 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -11,7 +11,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -1347,9 +1347,7 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") sf._get_footer() # should not raise exception - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="Fixup when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") def test_east_asian_unicode_series(self): # not aligned properly because of east asian width @@ -1724,9 +1722,7 @@ def chck_ncols(self, s): ncolsizes = len({len(line.strip()) for line in lines}) assert ncolsizes == 1 - @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="change when arrow is default" - ) + @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") def test_format_explicit(self): test_sers = gen_series_formatting() with option_context("display.max_rows", 4, "display.show_dimensions", False): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 49776d532db1d..7bf041a50b745 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -221,7 +221,7 @@ def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype): def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) - df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) + df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="D")}) expected_rows = [ ",A", diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py index fffb1b9b9d2a4..7aa7cebb5120f 100644 --- a/pandas/tests/io/formats/test_to_markdown.py +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -11,7 +11,7 @@ def test_keyword_deprecation(): # GH 57280 msg = ( - "Starting with pandas version 3.0.0 all arguments of to_markdown " + "Starting with pandas version 4.0 all arguments of to_markdown " "except for the argument 'buf' will be keyword-only." 
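# --- illustrative aside (not part of the patch) ---
# The keyword-only deadline in these messages moves from "3.0.0" to "4.0":
# positional arguments to to_markdown/to_string (other than buf) keep
# warning until then. Forward-compatible calls spell everything by keyword:
import pandas as pd

ser = pd.Series([1, 2])
text = ser.to_string(na_rep="<NA>")  # keyword-only: no deprecation warning
assert "1" in text
# --- end aside ---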
) s = pd.Series() diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 7c7069aa74eeb..5731f74a03852 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas import ( CategoricalIndex, @@ -38,7 +38,7 @@ class TestDataFrameToStringFormatters: def test_keyword_deprecation(self): # GH 57280 msg = ( - "Starting with pandas version 3.0.0 all arguments of to_string " + "Starting with pandas version 4.0 all arguments of to_string " "except for the argument 'buf' will be keyword-only." ) s = Series(["a", "b"]) @@ -849,7 +849,7 @@ def test_to_string(self): frame.to_string() # TODO: split or simplify this test? - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="fix when arrow is default") + @pytest.mark.xfail(using_string_dtype(), reason="fix when arrow is default") def test_to_string_index_with_nan(self): # GH#2850 df = DataFrame( diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index a0d5b3a741aaf..bddd71d2bd5f6 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -25,6 +27,10 @@ set_default_names, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture def df_schema(): @@ -32,7 +38,7 @@ def df_schema(): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), @@ -45,12 +51,12 @@ def df_table(): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), "D": pd.timedelta_range("1h", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], - "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + "H": pd.date_range("2016-01-01", freq="D", periods=4, tz="US/Central"), }, index=pd.Index(range(4), name="idx"), ) @@ -687,7 +693,7 @@ class TestTableOrientReader: {"ints": [1, 2, 3, 4]}, {"objects": ["a", "b", "c", "d"]}, {"objects": ["1", "2", "3", "4"]}, - {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)}, + {"date_ranges": pd.date_range("2016-01-01", freq="D", periods=4)}, {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))}, { "ordered_cats": pd.Series( @@ -699,7 +705,7 @@ class TestTableOrientReader: {"bools": [True, False, False, True]}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2016-01-01", freq="D", periods=4, tz="US/Central" ) # added in # GH 35973 }, ], @@ -738,7 +744,7 @@ def test_read_json_table_orient_raises(self, index_nm): {"ints": [1, 2, 3, 4]}, {"objects": ["a", "b", "c", "d"]}, {"objects": ["1", "2", "3", "4"]}, - {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)}, + {"date_ranges": pd.date_range("2016-01-01", freq="D", periods=4)}, {"categoricals": 
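The wholesale `freq="d"` to `freq="D"` renames in these fixtures appear to track the deprecation of lowercase day aliases on recent pandas development builds; only the capitalized form is future-proof. A small sketch of the supported spelling:

```python
import pandas as pd

# "D" is the supported day alias; the lowercase "d" accepted by older
# pandas versions is deprecated and slated for removal.
idx = pd.date_range("2016-01-01", freq="D", periods=4)
delta = pd.timedelta_range("1h", periods=4, freq="min")
```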
pd.Series(pd.Categorical(["a", "b", "c", "c"]))}, { "ordered_cats": pd.Series( @@ -750,7 +756,7 @@ def test_read_json_table_orient_raises(self, index_nm): {"bools": [True, False, False, True]}, { "timezones": pd.date_range( - "2016-01-01", freq="d", periods=4, tz="US/Central" + "2016-01-01", freq="D", periods=4, tz="US/Central" ) # added in # GH 35973 }, ], @@ -772,15 +778,15 @@ def test_read_json_table_period_orient(self, index_nm, vals): pd.Index(range(4)), pd.date_range( "2020-08-30", - freq="d", + freq="D", periods=4, )._with_freq(None), pd.date_range( - "2020-08-30", freq="d", periods=4, tz="US/Central" + "2020-08-30", freq="D", periods=4, tz="US/Central" )._with_freq(None), pd.MultiIndex.from_product( [ - pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"), + pd.date_range("2020-08-30", freq="D", periods=2, tz="US/Central"), ["x", "y"], ], ), @@ -790,10 +796,10 @@ def test_read_json_table_period_orient(self, index_nm, vals): "vals", [ {"floats": [1.1, 2.2, 3.3, 4.4]}, - {"dates": pd.date_range("2020-08-30", freq="d", periods=4)}, + {"dates": pd.date_range("2020-08-30", freq="D", periods=4)}, { "timezones": pd.date_range( - "2020-08-30", freq="d", periods=4, tz="Europe/London" + "2020-08-30", freq="D", periods=4, tz="Europe/London" ) }, ], @@ -810,12 +816,12 @@ def test_comprehensive(self): { "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], - "C": pd.date_range("2016-01-01", freq="d", periods=4), + "C": pd.date_range("2016-01-01", freq="D", periods=4), # 'D': pd.timedelta_range('1h', periods=4, freq='min'), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.1, 2.2, 3.3, 4.4], - "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"), + "H": pd.date_range("2016-01-01", freq="D", periods=4, tz="US/Central"), "I": [True, False, False, True], }, index=pd.Index(range(4), name="idx"), diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index 68c7a96920533..8de289afe9ff9 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -97,18 +97,22 @@ def test_as_json_table_type_ext_integer_dtype(self): class TestTableOrient: @pytest.fixture def da(self): + """Fixture for creating a DateArray.""" return DateArray([dt.date(2021, 10, 10)]) @pytest.fixture def dc(self): + """Fixture for creating a DecimalArray.""" return DecimalArray([decimal.Decimal(10)]) @pytest.fixture def sa(self): + """Fixture for creating a StringDtype array.""" return array(["pandas"], dtype="string") @pytest.fixture def ia(self): + """Fixture for creating an Int64Dtype array.""" return array([10], dtype="Int64") def test_build_date_series(self, da): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index d83e7b4641e88..fdbfbd004617e 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -516,7 +516,7 @@ def test_nonetype_record_path(self, nulls_fixture): ], record_path=["info"], ) - expected = DataFrame({"i": 2}, index=[0]) + expected = DataFrame({"i": 2}, index=range(1)) tm.assert_equal(result, expected) @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"']) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c4065ea01988f..d281729e9704c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ 
b/pandas/tests/io/json/test_pandas.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat import IS64 import pandas.util._test_decorators as td @@ -133,7 +133,13 @@ def test_frame_non_unique_index_raises(self, orient): [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], ], ) - def test_frame_non_unique_columns(self, orient, data): + def test_frame_non_unique_columns(self, orient, data, request): + if isinstance(data[0][0], Timestamp) and orient == "split": + mark = pytest.mark.xfail( + reason="GH#55827 non-nanosecond dt64 fails to round-trip" + ) + request.applymarker(mark) + df = DataFrame(data, index=[1, 2], columns=["x", "x"]) expected_warning = None @@ -141,7 +147,7 @@ def test_frame_non_unique_columns(self, orient, data): "The default 'epoch' date format is deprecated and will be removed " "in a future version, please use 'iso' date format instead." ) - if df.iloc[:, 0].dtype == "datetime64[ns]": + if df.iloc[:, 0].dtype == "datetime64[s]": expected_warning = FutureWarning with tm.assert_produces_warning(expected_warning, match=msg): @@ -150,7 +156,7 @@ def test_frame_non_unique_columns(self, orient, data): ) if orient == "values": expected = DataFrame(data) - if expected.iloc[:, 0].dtype == "datetime64[ns]": + if expected.iloc[:, 0].dtype == "datetime64[s]": # orient == "values" by default will write Timestamp objects out # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need @@ -183,6 +189,7 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): assert_json_roundtrip_equal(result, expected, orient) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): @@ -268,6 +275,7 @@ def test_roundtrip_empty(self, orient, convert_axes): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter @@ -695,6 +703,7 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) @@ -786,7 +795,7 @@ def test_frame_from_json_precise_float(self): def test_typ(self): s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64") - result = read_json(StringIO(s.to_json()), typ=None) + result = read_json(StringIO(s.to_json()), typ="series") tm.assert_series_equal(result, s) def test_reconstruction_index(self): @@ -804,6 +813,7 @@ def test_path(self, float_frame, int_frame, datetime_frame): df.to_json(path) read_json(path) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) @@ -816,6 +826,7 @@ def test_axis_dates(self, datetime_series, datetime_frame): tm.assert_series_equal(result, datetime_series, check_names=False) 
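The `datetime64[ns]` to `datetime64[s]` expectation changes in this file reflect value-dependent resolution inference: data built from `Timestamp` objects with no sub-second detail now lands on second resolution rather than being upcast to nanoseconds. A minimal sketch, assuming a pandas build with non-nanosecond inference enabled:

```python
import pandas as pd

df = pd.DataFrame([[pd.Timestamp("2013-01-01"), 3.5]])
# On builds with non-nanosecond inference, whole-second timestamps are
# stored at second resolution.
print(df[0].dtype)  # datetime64[s] on such builds; datetime64[ns] before
```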
assert result.name is None + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame @@ -856,6 +867,10 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): data.append("a") ser = Series(data, index=data) + if not as_object: + ser = ser.astype("M8[ns]") + if isinstance(ser.index, DatetimeIndex): + ser.index = ser.index.as_unit("ns") expected_warning = None if date_format == "epoch": @@ -897,10 +912,12 @@ def test_convert_dates_infer(self, infer_word): expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) + expected[infer_word] = expected[infer_word].astype("M8[ns]") result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "date,date_unit", [ @@ -961,6 +978,7 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_date_unit(self, unit, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101 20:43:42").as_unit("ns") @@ -1101,6 +1119,7 @@ def test_round_trip_exception(self, datapath): res = res.fillna(np.nan) tm.assert_frame_equal(res, df) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( @@ -1541,6 +1560,7 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) @@ -1553,6 +1573,7 @@ def test_from_json_to_json_table_index_and_columns(self, index, columns): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_json_to_json_table_dtypes(self): # GH21345 expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) @@ -1562,7 +1583,7 @@ def test_from_json_to_json_table_dtypes(self): # TODO: We are casting to string which coerces None to NaN before casting back # to object, ending up with incorrect na values - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion") + @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 @@ -1599,6 +1620,13 @@ def test_to_json_from_json_columns_dtypes(self, orient): ) tm.assert_frame_equal(result, expected) + def test_to_json_with_index_as_a_column_name(self): + df = DataFrame(data={"index": [1, 2], "a": [2, 3]}) + with pytest.raises( + ValueError, match="Overlapping names between the index and columns" + ): + df.to_json(orient="table") + @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 @@ -1836,7 +1864,7 @@ def test_to_json_indent(self, indent): assert result == expected @pytest.mark.skipif( - using_pyarrow_string_dtype(), + using_string_dtype(), reason="Adjust expected when infer_string is default, no bug here, " "just a complicated 
parametrization", ) @@ -2116,6 +2144,7 @@ def test_json_uint64(self): result = df.to_json(orient="split") assert result == expected + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_json_dtype_backend( self, string_storage, dtype_backend, orient, using_infer_string ): diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index d96ccb4b94cc2..3c843479b446a 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -165,11 +165,11 @@ def test_readjson_chunks_series(request, engine): s = pd.Series({"A": 1, "B": 2}) strio = StringIO(s.to_json(lines=True, orient="records")) - unchunked = read_json(strio, lines=True, typ="Series", engine=engine) + unchunked = read_json(strio, lines=True, typ="series", engine=engine) strio = StringIO(s.to_json(lines=True, orient="records")) with read_json( - strio, lines=True, typ="Series", chunksize=1, engine=engine + strio, lines=True, typ="series", chunksize=1, engine=engine ) as reader: chunked = pd.concat(reader) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 8e05a8e6fc5d8..62118f1c82ebb 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -10,7 +10,6 @@ import dateutil import numpy as np import pytest -import pytz import pandas._libs.json as ujson from pandas.compat import IS64 @@ -370,6 +369,7 @@ def test_encode_time_conversion_basic(self, test): def test_encode_time_conversion_pytz(self): # see gh-11473: to_json segfaults with timezone-aware datetimes + pytz = pytest.importorskip("pytz") test = datetime.time(10, 12, 15, 343243, pytz.utc) output = ujson.ujson_dumps(test) expected = f'"{test.isoformat()}"' diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 78a0b016bd353..a6504473fb55f 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -229,6 +231,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index df76b46cc6a7b..511db2c6a33d8 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -13,6 +13,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( EmptyDataError, ParserError, @@ -40,9 +42,7 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") + expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -64,6 +64,7 @@ def test_read_csv_local(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -144,9 +145,6 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): 
def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -168,6 +166,7 @@ def test_read_csv_dataframe(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -767,6 +766,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index ba31a9bc15fb5..d8b8f24abcedd 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -15,6 +15,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import WASM from pandas.errors import ( EmptyDataError, @@ -69,6 +71,7 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 2fcc80f58ae30..54b59ac4e25ed 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -9,6 +9,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -86,6 +88,7 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D @@ -260,7 +263,8 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): datetime(2000, 1, 5), datetime(2000, 1, 6), datetime(2000, 1, 7), - ] + ], + dtype="M8[s]", ), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py index 091edb67f6e19..668aab05b9fa4 100644 --- a/pandas/tests/io/parser/common/test_iterator.py +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -98,6 +98,31 @@ def test_iterator_stop_on_chunksize(all_parsers): tm.assert_frame_equal(concat(result), expected) +def test_nrows_iterator_without_chunksize(all_parsers): + # GH 59079 + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True, nrows=2) + return + + with parser.read_csv(StringIO(data), iterator=True, nrows=2) as reader: + result = reader.get_chunk() + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["foo", "bar"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "kwargs", 
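The new `test_nrows_iterator_without_chunksize` above covers GH 59079, where `nrows` was previously ignored by an iterator opened without a `chunksize`. A usage sketch of the behaviour under test (the default C engine; the pyarrow engine rejects `iterator=True` outright):

```python
from io import StringIO
import pandas as pd

data = "A,B,C\nfoo,1,2,3\nbar,4,5,6\nbaz,7,8,9\n"

# GH 59079: `nrows` now caps the iterator even when no chunksize is given,
# so the first get_chunk() returns exactly two rows.
with pd.read_csv(StringIO(data), iterator=True, nrows=2) as reader:
    chunk = reader.get_chunk()
print(len(chunk))  # 2
```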
[{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] ) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index d45368dece6d2..3f410a13c8f80 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserWarning import pandas as pd @@ -27,6 +29,8 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @@ -55,6 +59,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = all_parsers @@ -139,7 +144,7 @@ def test_numeric_dtype(all_parsers, any_real_numpy_dtype): expected = DataFrame([0, 1], dtype=any_real_numpy_dtype) result = parser.read_csv(StringIO(data), header=None, dtype=any_real_numpy_dtype) - tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result, check_column_type=False) @pytest.mark.usefixtures("pyarrow_xfail") @@ -299,6 +304,7 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -314,6 +320,7 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 @@ -456,6 +463,7 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -499,6 +507,7 @@ def test_dtype_backend_ea_dtype_specified(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -607,6 +616,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL @@ -617,7 +627,7 @@ def test_accurate_parsing_of_large_integers(all_parsers): AMZN,20230301181139587,2023552585717889759,2023552585717263360 MSFT,20230301181139587,2023552585717889863,2023552585717263361 NVDA,20230301181139587,2023552585717889827,2023552585717263361""" - orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 assert 
len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 @@ -639,3 +649,16 @@ def test_dtypes_with_usecols(all_parsers): values = ["1", "4"] expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) tm.assert_frame_equal(result, expected) + + +def test_index_col_with_dtype_no_rangeindex(all_parsers): + data = StringIO("345.5,519.5,0\n519.5,726.5,1") + result = all_parsers.read_csv( + data, + header=None, + names=["start", "stop", "bin_id"], + dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32}, + index_col="bin_id", + ).index + expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id") + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 39718ca2ec134..9226f265ca2b3 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import WASM from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( @@ -182,6 +184,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtypes(c_parser_only): parser = c_parser_only data = """\ diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 7986df62a6b6f..0423327c7333c 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -186,6 +188,7 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 7a72e66996d43..803114723bc74 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -26,7 +26,7 @@ def custom_dialect(): "escapechar": "~", "delimiter": ":", "skipinitialspace": False, - "quotechar": "~", + "quotechar": "`", "quoting": 3, } return dialect_name, dialect_kwargs diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index b7e3a13ec28b8..c6efbd8059138 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -368,7 +368,7 @@ def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), - index=Index([1, 7]), + index=range(1, 13, 6), columns=MultiIndex( levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 24d0a7626723e..ce2ed5e9764bd 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -343,6 +345,7 @@ def test_infer_types_boolean_sum(all_parsers): 
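The new `test_index_col_with_dtype_no_rangeindex` above pins down that a `dtype` mapping also applies to the column promoted via `index_col`, so the result carries a typed `Index` rather than a default `RangeIndex`. Essentially the scenario from the test:

```python
from io import StringIO
import numpy as np
import pandas as pd

data = StringIO("345.5,519.5,0\n519.5,726.5,1")
idx = pd.read_csv(
    data,
    header=None,
    names=["start", "stop", "bin_id"],
    dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
    index_col="bin_id",
).index
print(idx.dtype)  # uint32, not a RangeIndex
```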
tm.assert_frame_equal(result, expected, check_index_type=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 61d328138da96..6a2ae3bffdc74 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -8,6 +8,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas import DataFrame import pandas._testing as tm @@ -119,6 +121,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 649a1324686a7..348c19ac0f0c6 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -152,7 +152,8 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): with tm.ensure_clean(file_name) as path: df.to_csv(path) - final_dataframe = _generate_multi_thread_dataframe( - parser, path, num_rows, num_tasks - ) - tm.assert_frame_equal(df, final_dataframe) + result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks) + + expected = df[:] + expected["date"] = expected["date"].astype("M8[s]") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 1e370f649aef8..360a5feebe073 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -259,6 +261,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ @@ -426,6 +429,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", @@ -532,6 +536,7 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_na_values_dict_null_column_name(all_parsers): # see gh-57547 parser = all_parsers @@ -662,6 +667,7 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -713,6 +719,7 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty 
entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index f63cc3d56bf89..4ccfa8e81e883 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -75,6 +75,7 @@ def tips_df(datapath): @pytest.mark.single_cpu +@pytest.mark.network @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 3bb3d793606e1..386348c4bd687 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -12,7 +12,8 @@ import numpy as np import pytest -import pytz + +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -62,6 +63,7 @@ def test_date_col_as_index_col(all_parsers): datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 22, 0), ], + dtype="M8[s]", name="X1", ) expected = DataFrame( @@ -90,7 +92,7 @@ def test_nat_parse(all_parsers): df = DataFrame( { "A": np.arange(10, dtype="float64"), - "B": Timestamp("20010101").as_unit("ns"), + "B": Timestamp("20010101"), } ) df.iloc[3:6, :] = np.nan @@ -126,7 +128,7 @@ def test_parse_dates_string(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) + index = date_range("1/1/2009", periods=3, name="date", unit="s")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -143,6 +145,8 @@ def test_parse_dates_column_list(all_parsers, parse_dates): expected = DataFrame( {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} ) + expected["a"] = expected["a"].astype("M8[s]") + expected["c"] = expected["c"].astype("M8[s]") expected = expected.set_index(["a", "b"]) result = parser.read_csv( @@ -166,9 +170,10 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers + dti = date_range("2009-01-01", periods=3, freq="D", unit="s") index = MultiIndex.from_product( [ - (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + dti, ("one", "two", "three"), ], names=["index1", "index2"], @@ -209,13 +214,11 @@ def test_parse_tz_aware(all_parsers): data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) if parser.engine == "pyarrow": + pytz = pytest.importorskip("pytz") expected_tz = pytz.utc else: expected_tz = timezone.utc @@ -302,6 +305,7 @@ def test_parse_dates_empty_string(all_parsers): expected = DataFrame( [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] ) + expected["Date"] = expected["Date"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -312,18 +316,22 @@ def test_parse_dates_empty_string(all_parsers): ( "a\n04.15.2016", {"parse_dates": ["a"]}, - DataFrame([datetime(2016, 4, 15)], columns=["a"]), + DataFrame([datetime(2016, 4, 15)], 
columns=["a"], dtype="M8[s]"), ), ( "a\n04.15.2016", {"parse_dates": True, "index_col": 0}, - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), + DataFrame( + index=DatetimeIndex(["2016-04-15"], dtype="M8[s]", name="a"), columns=[] + ), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": ["a", "b"]}, DataFrame( - [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], + dtype="M8[s]", + columns=["a", "b"], ), ), ( @@ -331,7 +339,13 @@ def test_parse_dates_empty_string(all_parsers): {"parse_dates": True, "index_col": [0, 1]}, DataFrame( index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + [ + ( + Timestamp(2016, 4, 15).as_unit("s"), + Timestamp(2013, 9, 16).as_unit("s"), + ) + ], + names=["a", "b"], ), columns=[], ), @@ -399,6 +413,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=timezone(timedelta(minutes=540)), + unit="s", )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} @@ -406,6 +421,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -437,7 +453,7 @@ def test_parse_delimited_date_swap_no_warning( all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") if parser.engine == "pyarrow": if not dayfirst: # "CSV parse error: Empty CSV file or block" @@ -470,7 +486,7 @@ def test_parse_delimited_date_swap_with_warning( all_parsers, date_string, dayfirst, expected ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." @@ -555,9 +571,7 @@ def test_date_parser_multiindex_columns(all_parsers): 1,2 2019-12-31,6""" result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) - expected = DataFrame( - {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]} - ) + expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]}) tm.assert_frame_equal(result, expected) @@ -591,16 +605,18 @@ def test_date_parser_usecols_thousands(all_parsers): thousands="-", ) expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) + expected["C"] = expected["C"].astype("M8[s]") tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. 
" @@ -661,7 +677,7 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): # GH47880 initial_value = f"date\n{date_string}" expected = DatetimeIndex( - ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-01-31"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " @@ -716,7 +732,8 @@ def test_replace_nans_before_parsing_dates(all_parsers): pd.NaT, Timestamp("2017-09-09"), ] - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -731,9 +748,11 @@ def test_parse_dates_and_string_dtype(all_parsers): result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) expected["a"] = expected["a"].astype("string") + expected["b"] = expected["b"].astype("M8[s]") tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers @@ -750,7 +769,7 @@ def test_parse_dot_separated_dates(all_parsers): else: expected_index = DatetimeIndex( ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], - dtype="datetime64[ns]", + dtype="datetime64[ms]", name="a", ) warn = UserWarning @@ -783,7 +802,8 @@ def test_parse_dates_dict_format(all_parsers): { "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -816,9 +836,6 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index c0ea5936164a1..26480010fc687 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( ParserError, ParserWarning, @@ -497,6 +499,7 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -524,6 +527,7 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 0a9f6bd83e0d9..b7b4a77c9e048 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -13,6 +13,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import EmptyDataError import pandas as pd @@ -298,7 +300,8 @@ def test_fwf_regression(): "2009-06-13 20:40:00", "2009-06-13 20:50:00", "2009-06-13 21:00:00", - ] + ], + dtype="M8[us]", ), columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) @@ -311,6 +314,7 @@ def test_fwf_regression(): 
parse_dates=True, date_format="%Y%j%H%M%S", ) + expected.index = expected.index.astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -937,6 +941,7 @@ def test_widths_and_usecols(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(string_storage, dtype_backend): # GH#50289 if string_storage == "python": diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 17a806d05fe28..99642ee4befc6 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -42,7 +42,9 @@ def test_skip_rows_bug(all_parsers, skiprows): StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( @@ -85,7 +87,9 @@ def test_skip_rows_blank(all_parsers): StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index bc4c4c2e24e9c..01e576ba40f26 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -84,6 +86,7 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 0cf3fe894c916..cc54f2487aa60 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -70,7 +70,7 @@ def test_usecols_with_parse_dates3(all_parsers): parse_dates = [0] cols = { - "a": Timestamp("2016-09-21").as_unit("ns"), + "a": Timestamp("2016-09-21"), "b": [1], "c": [1], "d": [2], diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 82b42beb38ae0..d02364a77df90 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserError from pandas import ( @@ -529,6 +531,7 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """ diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 7f7f7eccb2382..d3b4bb0ea6c72 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from 
pandas._libs.tslibs import Timestamp from pandas.compat import PY312 @@ -23,7 +25,10 @@ ensure_clean_store, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 2ab9f1ac8be1c..998021bad9001 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( Categorical, DataFrame, @@ -14,7 +16,10 @@ ensure_clean_store, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_categorical(setup_path): diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index c5cac5a5caf09..d140cfc941e16 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -11,6 +13,10 @@ from pandas.io.pytables import read_hdf +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def test_complex_fixed(tmp_path, setup_path): df = DataFrame( diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 2021101098892..c31b9989ef35e 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( CategoricalIndex, DataFrame, @@ -22,7 +24,10 @@ _maybe_adjust_name, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_pass_spec_to_storer(setup_path): diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index d8f38e9cdad1f..606b19ac0ed75 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import ( PY311, is_ci_environment, @@ -33,7 +35,10 @@ from pandas.io import pytables from pandas.io.pytables import Term -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index d526697c7574a..a4257b54dd6db 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp import pandas as pd @@ -22,7 +24,10 @@ ) from pandas.util import _test_decorators as td -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_format_type(tmp_path, 
setup_path): diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index e33ddaf3b81f0..dd3a0eabe95ae 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas as pd @@ -24,7 +26,10 @@ from pandas.io.pytables import TableIterator -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_read_missing_key_close_store(tmp_path, setup_path): @@ -317,3 +322,14 @@ def test_read_infer_string(tmp_path, setup_path): columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) + + +def test_hdfstore_read_datetime64_unit_s(tmp_path, setup_path): + # GH 59004 + df_s = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]") + path = tmp_path / setup_path + with HDFStore(path, mode="w") as store: + store.put("df_s", df_s) + with HDFStore(path, mode="r") as store: + df_fromstore = store.get("df_s") + tm.assert_frame_equal(df_s, df_fromstore) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 51ee289c8e27a..6b98a720e4299 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -24,7 +26,10 @@ ) from pandas.util import _test_decorators as td -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_conv_read_write(): @@ -236,8 +241,10 @@ def test_table_values_dtypes_roundtrip(setup_path): df1["float322"] = 1.0 df1["float322"] = df1["float322"].astype("float32") df1["bool"] = df1["float32"] > 0 - df1["time1"] = Timestamp("20130101") - df1["time2"] = Timestamp("20130102") + df1["time_s_1"] = Timestamp("20130101") + df1["time_s_2"] = Timestamp("20130101 00:00:00") + df1["time_ms"] = Timestamp("20130101 00:00:00.000") + df1["time_ns"] = Timestamp("20130102 00:00:00.000000000") store.append("df_mixed_dtypes1", df1) result = store.select("df_mixed_dtypes1").dtypes.value_counts() @@ -252,7 +259,9 @@ def test_table_values_dtypes_roundtrip(setup_path): "int8": 1, "int64": 1, "object": 1, - "datetime64[ns]": 2, + "datetime64[s]": 2, + "datetime64[ms]": 1, + "datetime64[ns]": 1, }, name="count", ) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 752e2fc570023..4b20b929ef447 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.compat import PY312 @@ -25,7 +27,10 @@ from pandas.io.pytables import Term -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] def test_select_columns_in_where(setup_path): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 471f7b8958ee4..a6fe9529c594a 100644 --- 
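The new `test_hdfstore_read_datetime64_unit_s` above (GH 59004) checks that a second-resolution datetime column survives an HDF5 round trip instead of coming back as nanoseconds. The behaviour under test, as a sketch; `roundtrip.h5` is an illustrative path and PyTables must be installed:

```python
import pandas as pd

df_s = pd.DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]")

path = "roundtrip.h5"  # any writable location
with pd.HDFStore(path, mode="w") as store:
    store.put("df_s", df_s)
with pd.HDFStore(path, mode="r") as store:
    roundtripped = store.get("df_s")

# GH 59004: dtype should still be datetime64[s] after the round trip.
print(roundtripped[0].dtype)
```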
a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import PY312 import pandas as pd @@ -33,7 +35,10 @@ read_hdf, ) -pytestmark = pytest.mark.single_cpu +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] tables = pytest.importorskip("tables") @@ -613,10 +618,14 @@ def test_store_index_name(setup_path): @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 - idx = DatetimeIndex( - [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], - name="cols\u05d2", - ).tz_localize(tz) + idx = ( + DatetimeIndex( + [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], + name="cols\u05d2", + ) + .tz_localize(tz) + .as_unit(unit) + ) idx1 = ( DatetimeIndex( [dt.date(2010, 1, 1), dt.date(2010, 1, 2)], diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 9192804e49bd1..8f179f844e4d0 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs.timezones import maybe_get_tz import pandas.util._test_decorators as td @@ -23,6 +25,10 @@ ensure_clean_store, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index fc5df6d9babcb..3f5b73f4aa8a4 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat._constants import ( IS64, WASM, @@ -18,6 +20,10 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture def dirpath(datapath): @@ -30,9 +36,9 @@ def data_test_ix(request, dirpath): fname = os.path.join(dirpath, f"test_sas7bdat_{i}.csv") df = pd.read_csv(fname) epoch = datetime(1960, 1, 1) - t1 = pd.to_timedelta(df["Column4"], unit="d") + t1 = pd.to_timedelta(df["Column4"], unit="D") df["Column4"] = (epoch + t1).astype("M8[s]") - t2 = pd.to_timedelta(df["Column12"], unit="d") + t2 = pd.to_timedelta(df["Column12"], unit="D") df["Column12"] = (epoch + t2).astype("M8[s]") for k in range(df.shape[1]): col = df.iloc[:, k] diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index babbddafa3b49..923b880004c26 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -28,6 +30,10 @@ init_qt_clipboard, ) +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + def build_kwargs(sep, excel): kwargs = {} diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e4b4d3a82669d..c583f9b2c4f99 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -19,6 +19,8 @@ import numpy as np import pytest +from pandas._config import 
using_string_dtype + from pandas.compat import ( WASM, is_platform_windows, @@ -137,6 +139,7 @@ def test_bytesiowrapper_returns_correct_bytes(self): assert result == data.encode("utf-8") # Test that pyarrow can handle a file opened with get_handle + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_handle_pyarrow_compat(self): pa_csv = pytest.importorskip("pyarrow.csv") @@ -334,6 +337,7 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], ) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_write_fspath_all(self, writer_name, writer_kwargs, module): if writer_name in ["to_latex"]: # uses Styler implementation pytest.importorskip("jinja2") @@ -439,6 +443,7 @@ def test_unknown_engine(self): with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_binary_mode(self): """ 'encoding' shouldn't be passed to 'open' in binary mode. @@ -474,7 +479,10 @@ def test_warning_missing_utf_bom(self, encoding, compression_): df.to_csv(path, compression=compression_, encoding=encoding) # reading should fail (otherwise we wouldn't need the warning) - msg = r"UTF-\d+ stream does not start with BOM" + msg = ( + r"UTF-\d+ stream does not start with BOM|" + r"'utf-\d+' codec can't decode byte" + ) with pytest.raises(UnicodeError, match=msg): pd.read_csv(path, compression=compression_, encoding=encoding) @@ -494,6 +502,7 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): @@ -514,6 +523,7 @@ def test_codecs_encoding(encoding, format): tm.assert_frame_equal(expected, df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( @@ -552,7 +562,7 @@ def test_explicit_encoding(io_class, mode, msg): expected.to_csv(buffer, mode=f"w{mode}") -@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"]) +@pytest.mark.parametrize("encoding_errors", ["strict", "replace"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_encoding_errors(encoding_errors, format): # GH39450 @@ -587,6 +597,17 @@ def test_encoding_errors(encoding_errors, format): tm.assert_frame_equal(df, expected) +@pytest.mark.parametrize("encoding_errors", [0, None]) +def test_encoding_errors_badtype(encoding_errors): + # GH 59075 + content = StringIO("A,B\n1,2\n3,4\n") + reader = partial(pd.read_csv, encoding_errors=encoding_errors) + expected_error = "encoding_errors must be a string, got " + expected_error += f"{type(encoding_errors).__name__}" + with pytest.raises(ValueError, match=expected_error): + reader(content) + + def test_bad_encdoing_errors(): # GH 39777 with tm.ensure_clean() as path: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 00082be7e07e8..5eb202dd5aa24 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -12,6 +12,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas as pd @@ -137,6 +139,7 @@ def 
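The new `test_encoding_errors_badtype` above (GH 59075) covers up-front validation: a non-string `encoding_errors` value now raises a `ValueError` instead of being forwarded to the decoder. The failure mode, sketched:

```python
from io import StringIO
import pandas as pd

content = StringIO("A,B\n1,2\n3,4\n")
# GH 59075: rejected immediately with
# "encoding_errors must be a string, got int".
pd.read_csv(content, encoding_errors=0)  # raises ValueError
```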
test_compression_warning(compression_only): df.to_csv(handles.handle, compression=compression_only) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_compression_binary(compression_only): """ Binary file handles support compression. @@ -231,7 +234,7 @@ def test_with_missing_lzma(): @pytest.mark.single_cpu def test_with_missing_lzma_runtime(): - """Tests if RuntimeError is hit when calling lzma without + """Tests if ModuleNotFoundError is hit when calling lzma without having the module available. """ code = textwrap.dedent( @@ -241,7 +244,7 @@ def test_with_missing_lzma_runtime(): sys.modules['lzma'] = None import pandas as pd df = pd.DataFrame() - with pytest.raises(RuntimeError, match='lzma module'): + with pytest.raises(ModuleNotFoundError, match='import of lzma'): df.to_csv('foo.csv', compression='xz') """ ) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 893728748f276..5aa8f1c69fe44 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -1,8 +1,12 @@ """test feather-format compat""" +import zoneinfo + import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import ( @@ -12,9 +16,12 @@ from pandas.io.feather_format import read_feather, to_feather # isort:skip -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] pa = pytest.importorskip("pyarrow") @@ -62,6 +69,7 @@ def test_error(self): self.check_error_on_write(obj, ValueError, msg) def test_basic(self): + tz = zoneinfo.ZoneInfo("US/Eastern") df = pd.DataFrame( { "string": list("abc"), @@ -76,7 +84,7 @@ def test_basic(self): list(pd.date_range("20130101", periods=3)), freq=None ), "dttz": pd.DatetimeIndex( - list(pd.date_range("20130101", periods=3, tz="US/Eastern")), + list(pd.date_range("20130101", periods=3, tz=tz)), freq=None, ), "dt_with_null": [ @@ -93,7 +101,7 @@ def test_basic(self): df["timedeltas"] = pd.timedelta_range("1 day", periods=3) df["intervals"] = pd.interval_range(0, 3, 3) - assert df.dttz.dtype.tz.zone == "US/Eastern" + assert df.dttz.dtype.tz.key == "US/Eastern" expected = df.copy() expected.loc[1, "bool_with_null"] = None diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index f6fb032b9d51a..7ffee9ea78ddc 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, date_range, @@ -72,7 +74,9 @@ def test_read_csv(cleared_fs, df1): w.write(text) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_reasonable_error(monkeypatch, cleared_fs): @@ -95,7 +99,9 @@ def test_to_csv(cleared_fs, df1): df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_to_excel(cleared_fs, df1): @@ -106,7 +112,9 @@ def test_to_excel(cleared_fs, df1): df2 = read_excel(path, 
parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) @pytest.mark.parametrize("binary_mode", [False, True]) @@ -128,7 +136,9 @@ def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1): ) assert not fsspec_object.closed - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_csv_options(fsspectest): @@ -194,6 +204,7 @@ def test_arrowparquet_options(fsspectest): assert fsspectest.test[0] == "parquet_read" +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -251,6 +262,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.single_cpu def test_s3_parquet(s3_public_bucket, s3so, df1): pytest.importorskip("fastparquet") @@ -271,6 +283,7 @@ def test_not_present_exception(): read_csv("memory://test/test.csv") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_feather_options(fsspectest): pytest.importorskip("pyarrow") df = DataFrame({"a": [0]}) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4b2be41d0c9f9..e113fa25b2a3f 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,6 +7,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + +from pandas.compat.pyarrow import pa_version_under17p0 + from pandas import ( DataFrame, Index, @@ -52,7 +56,7 @@ def ls(self, path, **kwargs): # Patches pyarrow; other processes should not pick up change @pytest.mark.single_cpu @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) -def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): +def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys, request): """ Test that many to/read functions support GCS. 
@@ -96,6 +100,13 @@ def from_uri(path): to_local = pathlib.Path(path.replace("gs://", "")).absolute().as_uri() return pa_fs.LocalFileSystem(to_local) + request.applymarker( + pytest.mark.xfail( + not pa_version_under17p0, + raises=TypeError, + reason="pyarrow 17 broke the mocked filesystem", + ) + ) with monkeypatch.context() as m: m.setattr(pa_fs, "FileSystem", MockFileSystem) df1.to_parquet(path) @@ -107,7 +118,11 @@ def from_uri(path): df1.to_markdown(path) df2 = df1 - tm.assert_frame_equal(df1, df2) + expected = df1[:] + if format in ["csv", "excel"]: + expected["dt"] = expected["dt"].dt.as_unit("s") + + tm.assert_frame_equal(df2, expected) def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): @@ -143,6 +158,7 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 594c1d02b94cc..164646aedf464 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -13,6 +13,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -36,6 +38,10 @@ from pandas.io.common import file_path_to_url +pytestmark = pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string)", strict=False +) + @pytest.fixture( params=[ @@ -1044,11 +1050,15 @@ def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) - expected = df.to_html() - res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) - tm.assert_frame_equal(df, res[0]) - res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) - tm.assert_frame_equal(df, res[0]) + + expected = df[:] + expected["date"] = expected["date"].dt.as_unit("s") + + str_df = df.to_html() + res = flavor_read_html(StringIO(str_df), parse_dates=[1], index_col=0) + tm.assert_frame_equal(expected, res[0]) + res = flavor_read_html(StringIO(str_df), parse_dates=["date"], index_col=0) + tm.assert_frame_equal(expected, res[0]) def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py index dfae294a147a2..b11fe931f46e5 100644 --- a/pandas/tests/io/test_http_headers.py +++ b/pandas/tests/io/test_http_headers.py @@ -8,6 +8,8 @@ import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -84,6 +86,7 @@ def stata_responder(df): return bio.getvalue() +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "responder, read_method", [ diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index de6d46492e916..a189afbac070d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import read_orc import pandas._testing as tm @@ -18,9 +20,12 @@ import pyarrow as pa -pytestmark 
= pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.fixture @@ -321,6 +326,8 @@ def test_orc_dtype_backend_pyarrow(): ], } ) + # FIXME: without casting to ns we do not round-trip correctly + df["datetime_with_nat"] = df["datetime_with_nat"].astype("M8[ns]") bytes_data = df.copy().to_orc() result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2860b3a6483af..561c718ea5851 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -9,11 +9,14 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import is_platform_windows from pandas.compat.pyarrow import ( pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, + pa_version_under17p0, ) import pandas as pd @@ -48,6 +51,7 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -670,6 +674,7 @@ def test_read_empty_array(self, pa, dtype): class TestParquetPyArrow(Base): + @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") def test_basic(self, pa, df_full): df = df_full pytest.importorskip("pyarrow", "11.0.0") @@ -706,6 +711,14 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None + if pa_version_under11p0: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ns]" + ) + else: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ms]" + ) tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): @@ -961,7 +974,11 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + + expected = df[:] + if pa_version_under11p0: + expected.index = expected.index.as_unit("ns") + check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -972,6 +989,7 @@ def test_filter_row_groups(self, pa): result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 + @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning") def test_read_dtype_backend_pyarrow_config(self, pa, df_full): import pyarrow @@ -988,13 +1006,14 @@ def test_read_dtype_backend_pyarrow_config(self, pa, df_full): if pa_version_under13p0: # pyarrow infers datetimes as us instead of ns expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]") - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( - "timestamp[us][pyarrow]" - ) expected["datetime_tz"] = expected["datetime_tz"].astype( pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "timestamp[ms][pyarrow]" + ) + check_round_trip( df, engine=pa, @@ -1018,6 +1037,9 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + @pytest.mark.xfail( + 
pa_version_under17p0, reason="pa.pandas_compat passes 'datetime64' to .astype" + ) def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -1107,13 +1129,17 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # df.to_parquet(tmp_path / "test.parquet") # result = read_parquet(tmp_path / "test.parquet") # assert result["strings"].dtype == "string" + # FIXME: don't leave commented-out class TestParquetFastParquet(Base): + @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") def test_basic(self, fp, df_full): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Eastern") df = df_full - dti = pd.date_range("20130101", periods=3, tz="US/Eastern") + dti = pd.date_range("20130101", periods=3, tz=tz) dti = dti._with_freq(None) # freq doesn't round-trip df["datetime_tz"] = dti df["timedelta"] = pd.timedelta_range("1 day", periods=3) @@ -1254,6 +1280,25 @@ def test_error_on_using_partition_cols_and_partition_on( partition_cols=partition_cols, ) + def test_empty_dataframe(self, fp): + # GH #27339 + df = pd.DataFrame() + expected = df.copy() + check_round_trip(df, fp, expected=expected) + + @pytest.mark.xfail( + reason="fastparquet passed mismatched values/dtype to DatetimeArray " + "constructor, see https://github.com/dask/fastparquet/issues/891" + ) + def test_timezone_aware_index(self, fp, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) + def test_close_file_handle_on_read_error(self): with tm.ensure_clean("test.parquet") as path: pathlib.Path(path).write_bytes(b"breakit") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 1420e24858ffb..5fe0f1265edff 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,7 +13,6 @@ from __future__ import annotations -from array import array import bz2 import datetime import functools @@ -32,12 +31,8 @@ import numpy as np import pytest -from pandas.compat import ( - get_lzma_file, - is_platform_little_endian, -) +from pandas.compat import is_platform_little_endian from pandas.compat._optional import import_optional_dependency -from pandas.compat.compressors import flatten_buffer import pandas as pd from pandas import ( @@ -81,36 +76,8 @@ def compare_element(result, expected, typ): # --------------------- -@pytest.mark.parametrize( - "data", - [ - b"123", - b"123456", - bytearray(b"123"), - memoryview(b"123"), - pickle.PickleBuffer(b"123"), - array("I", [1, 2, 3]), - memoryview(b"123456").cast("B", (3, 2)), - memoryview(b"123456").cast("B", (3, 2))[::2], - np.arange(12).reshape((3, 4), order="C"), - np.arange(12).reshape((3, 4), order="F"), - np.arange(12).reshape((3, 4), order="C")[:, ::2], - ], -) -def test_flatten_buffer(data): - result = flatten_buffer(data) - expected = memoryview(data).tobytes("A") - assert result == expected - if isinstance(data, (bytes, bytearray)): - assert result is data - elif isinstance(result, memoryview): - assert result.ndim == 1 - assert result.format == "B" - assert result.contiguous - assert result.shape == (result.nbytes,) - - def test_pickles(datapath): + pytest.importorskip("pytz") if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") @@ -261,7 +228,9 @@ def compress_file(self, src_path, dest_path, compression): tarinfo = tar.gettarinfo(src_path, 
os.path.basename(src_path)) tar.addfile(tarinfo, fh) elif compression == "xz": - f = get_lzma_file()(dest_path, "w") + import lzma + + f = lzma.LZMAFile(dest_path, "w") elif compression == "zstd": f = import_optional_dependency("zstandard").open(dest_path, "wb") else: diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6058f34d25ad3..a21893f66722a 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -18,11 +18,10 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib -from pandas.compat import ( - pa_version_under13p0, - pa_version_under14p1, -) +from pandas.compat import pa_version_under14p1 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -61,9 +60,12 @@ import sqlalchemy -pytestmark = pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) +pytestmark = [ + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), + pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), +] @pytest.fixture @@ -368,7 +370,7 @@ def create_and_load_postgres_datetz(conn): Timestamp("2000-01-01 08:00:00", tz="UTC"), Timestamp("2000-06-01 07:00:00", tz="UTC"), ] - return Series(expected_data, name="DateColWithTz") + return Series(expected_data, name="DateColWithTz").astype("M8[us, UTC]") def check_iris_frame(frame: DataFrame): @@ -1702,11 +1704,9 @@ def test_api_roundtrip(conn, request, test_frame1): # HACK! if "adbc" in conn_name: - result = result.rename(columns={"__index_level_0__": "level_0"}) - result.index = test_frame1.index - result.set_index("level_0", inplace=True) - result.index.astype(int) - result.index.name = None + result = result.drop(columns="__index_level_0__") + else: + result = result.drop(columns="level_0") tm.assert_frame_equal(result, test_frame1) @@ -1824,7 +1824,7 @@ def test_api_custom_dateparsing_error( pytest.mark.xfail(reason="failing combination of arguments") ) - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + expected = types_data_frame.astype({"DateCol": "datetime64[s]"}) result = read_sql( text, @@ -1847,10 +1847,12 @@ def test_api_custom_dateparsing_error( } ) - if not pa_version_under13p0: - # TODO: is this astype safe? 
- expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") - + if conn_name == "postgresql_adbc_types" and pa_version_under14p1: + expected["DateCol"] = expected["DateCol"].astype("datetime64[ns]") + elif "postgres" in conn_name or "mysql" in conn_name: + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + else: + expected["DateCol"] = expected["DateCol"].astype("datetime64[s]") tm.assert_frame_equal(result, expected) @@ -2835,7 +2837,9 @@ def test_datetime_with_timezone_table(conn, request): conn = request.getfixturevalue(conn) expected = create_and_load_postgres_datetz(conn) result = sql.read_sql_table("datetz", conn) - tm.assert_frame_equal(result, expected.to_frame()) + + exp_frame = expected.to_frame() + tm.assert_frame_equal(result, exp_frame) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2847,7 +2851,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): # For dbs that support timestamps with timezones, should get back UTC # otherwise naive data should be returned expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific", unit="us")} ) assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 @@ -2865,7 +2869,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): if "sqlite" in conn_name: # read_sql_query does not return datetime type like read_sql_table assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) + result["A"] = to_datetime(result["A"]).dt.as_unit("us") tm.assert_frame_equal(result, expected) @@ -2876,7 +2880,9 @@ def test_out_of_bounds_datetime(conn, request): data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1 result = sql.read_sql_table("test_datetime_obb", conn) - expected = DataFrame([pd.NaT], columns=["date"]) + expected = DataFrame( + np.array([datetime(9999, 1, 1)], dtype="M8[us]"), columns=["date"] + ) tm.assert_frame_equal(result, expected) @@ -2885,7 +2891,7 @@ def test_naive_datetimeindex_roundtrip(conn, request): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC conn = request.getfixturevalue(conn) - dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) + dates = date_range("2018-01-01", periods=5, freq="6h", unit="us")._with_freq(None) expected = DataFrame({"nums": range(5)}, index=dates) assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 result = sql.read_sql_table("foo_table", conn, index_col="info_date") @@ -2937,7 +2943,10 @@ def test_datetime(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) result = result.drop("index", axis=1) - tm.assert_frame_equal(result, df) + + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) @@ -2945,9 +2954,7 @@ def test_datetime(conn, request): if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2962,16 +2969,17 @@ def test_datetime_NaT(conn, request): # with read_table -> type 
information from schema used result = sql.read_sql_table("test_datetime", conn) - tm.assert_frame_equal(result, df) + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"], errors="coerce") - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -3963,6 +3971,7 @@ def test_self_join_date_columns(postgresql_psycopg2_engine): expected = DataFrame( [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 ) + expected["created_dt"] = expected["created_dt"].astype("M8[us, UTC]") tm.assert_frame_equal(result, expected) # Cleanup diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2f981953a6237..9f5085ff2ad28 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -120,9 +122,11 @@ def test_read_index_col_none(self, version, temp_file): expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(read_df, expected, check_index_type=True) - @pytest.mark.parametrize("file", ["stata1_114", "stata1_117"]) - def test_read_dta1(self, file, datapath): - file = datapath("io", "data", "stata", f"{file}.dta") + @pytest.mark.parametrize( + "version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119] + ) + def test_read_dta1(self, version, datapath): + file = datapath("io", "data", "stata", f"stata1_{version}.dta") parsed = self.read_dta(file) # Pandas uses np.nan as missing value. 
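Editor's note on the test_sql.py hunks above: the SQLAlchemy-backed paths now surface microsecond-precision timestamps, which is why the expectations pin `unit="us"` or cast with `.dt.as_unit("us")` instead of assuming nanoseconds. A rough sketch of the round-trip using the stdlib sqlite3 driver; the table name `demo` is made up for illustration:

    import sqlite3

    import pandas as pd
    import pandas._testing as tm

    conn = sqlite3.connect(":memory:")
    df = pd.DataFrame(
        {"A": pd.date_range("2013-01-01 09:00:00", periods=3, unit="us")}
    )
    df.to_sql("demo", conn, index=False)

    # sqlite has no native datetime type, so the values come back as
    # strings; parse them and pin the unit to match the expectation.
    result = pd.read_sql_query("SELECT * FROM demo", conn)
    result["A"] = pd.to_datetime(result["A"]).dt.as_unit("us")
    tm.assert_frame_equal(result, df)
    conn.close()
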
@@ -136,6 +140,18 @@ def test_read_dta1(self, file, datapath): # the casting doesn't fail so need to match stata here expected["float_miss"] = expected["float_miss"].astype(np.float32) + # Column names too long for older Stata formats + if version <= 108: + expected = expected.rename( + columns={ + "float_miss": "f_miss", + "double_miss": "d_miss", + "byte_miss": "b_miss", + "int_miss": "i_miss", + "long_miss": "l_miss", + } + ) + tm.assert_frame_equal(parsed, expected) def test_read_dta2(self, datapath): @@ -181,9 +197,7 @@ def test_read_dta2(self, datapath): expected["monthly_date"] = expected["monthly_date"].astype("M8[s]") expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]") expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]") - expected["yearly_date"] = ( - expected["yearly_date"].astype("Period[s]").array.view("M8[s]") - ) + expected["yearly_date"] = expected["yearly_date"].astype("M8[s]") path1 = datapath("io", "data", "stata", "stata2_114.dta") path2 = datapath("io", "data", "stata", "stata2_115.dta") @@ -206,9 +220,9 @@ def test_read_dta2(self, datapath): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) @pytest.mark.parametrize( "file", ["stata3_113", "stata3_114", "stata3_115", "stata3_117"] @@ -269,7 +283,7 @@ def test_read_dta4(self, version, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize("version", [105, 108]) + @pytest.mark.parametrize("version", [102, 103, 104, 105, 108]) def test_readold_dta4(self, version, datapath): # This test is the same as test_read_dta4 above except that the columns # had to be renamed to match the restrictions in older file format @@ -316,8 +330,19 @@ def test_readold_dta4(self, version, datapath): tm.assert_frame_equal(parsed, expected) # File containing strls - def test_read_dta12(self, datapath): - parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta")) + @pytest.mark.parametrize( + "file", + [ + "stata12_117", + "stata12_be_117", + "stata12_118", + "stata12_be_118", + "stata12_119", + "stata12_be_119", + ], + ) + def test_read_dta_strl(self, file, datapath): + parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) expected = DataFrame.from_records( [ [1, "abc", "abcdefghi"], @@ -327,10 +352,20 @@ def test_read_dta12(self, datapath): columns=["x", "y", "z"], ) - tm.assert_frame_equal(parsed_117, expected, check_dtype=False) + tm.assert_frame_equal(parsed, expected, check_dtype=False) - def test_read_dta18(self, datapath): - parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata14_118", + "stata14_be_118", + "stata14_119", + "stata14_be_119", + ], + ) + def test_read_dta118_119(self, file, datapath): + parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) parsed_118["Bytes"] = parsed_118["Bytes"].astype("O") expected = DataFrame.from_records( [ @@ 
-354,7 +389,7 @@ def test_read_dta18(self, datapath): for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) - with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr: + with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr: vl = rdr.variable_labels() vl_expected = { "Unicode_Cities_Strl": "Here are some strls with Ünicode chars", @@ -400,6 +435,7 @@ def test_write_dta6(self, datapath, temp_file): check_index_type=False, ) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta10(self, version, temp_file): original = DataFrame( @@ -885,8 +921,8 @@ def test_missing_value_generator(self, temp_file): ) assert val.string == ".z" - @pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"]) - def test_missing_value_conversion(self, file, datapath): + @pytest.mark.parametrize("version", [113, 115, 117]) + def test_missing_value_conversion(self, version, datapath): columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] smv = StataMissingValue(101) keys = sorted(smv.MISSING_VALUES.keys()) @@ -897,7 +933,41 @@ def test_missing_value_conversion(self, file, datapath): expected = DataFrame(data, columns=columns) parsed = read_stata( - datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, + ) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("version", [104, 105, 108, 110, 111]) + def test_missing_value_conversion_compat(self, version, datapath): + columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] + smv = StataMissingValue(101) + keys = sorted(smv.MISSING_VALUES.keys()) + data = [] + row = [StataMissingValue(keys[j * 27]) for j in range(5)] + data.append(row) + expected = DataFrame(data, columns=columns) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, + ) + tm.assert_frame_equal(parsed, expected) + + # The byte type was not supported prior to the 104 format + @pytest.mark.parametrize("version", [102, 103]) + def test_missing_value_conversion_compat_nobyte(self, version, datapath): + columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] + smv = StataMissingValue(101) + keys = sorted(smv.MISSING_VALUES.keys()) + data = [] + row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]] + data.append(row) + expected = DataFrame(data, columns=columns) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, ) tm.assert_frame_equal(parsed, expected) @@ -952,8 +1022,8 @@ def test_big_dates(self, datapath, temp_file): parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta")) parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta")) - tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True) - tm.assert_frame_equal(expected, parsed_117, check_datetimelike_compat=True) + tm.assert_frame_equal(expected, parsed_115) + tm.assert_frame_equal(expected, parsed_117) date_conversion = {c: c[-2:] for c in columns} # {c : c[-2:] for c in columns} @@ -965,7 +1035,6 @@ def test_big_dates(self, datapath, temp_file): tm.assert_frame_equal( written_and_read_again.set_index("index"), expected.set_index(expected.index.astype(np.int32)), - check_datetimelike_compat=True, ) def test_dtype_conversion(self, datapath): @@ -1207,6 
+1276,7 @@ def test_categorical_ordering(self, file, datapath): assert parsed[col].cat.ordered assert not parsed_unordered[col].cat.ordered + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1252,7 +1322,9 @@ def test_read_chunks_117( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize @@ -1297,6 +1369,7 @@ def test_iterator(self, datapath): from_chunks = pd.concat(itr) tm.assert_frame_equal(parsed, from_chunks) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1344,7 +1417,9 @@ def test_read_chunks_115( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize @@ -1599,6 +1674,7 @@ def test_inf(self, infval, temp_file): path = temp_file df.to_stata(path) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_pathlib(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -1623,6 +1699,7 @@ def test_value_labels_iterator(self, write_index, temp_file): value_labels = dta_iter.value_labels() assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}} + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_set_index(self, temp_file): # GH 17328 df = DataFrame( @@ -1656,7 +1733,9 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - def test_writer_117(self, temp_file): + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.parametrize("byteorder", ["little", "big"]) + def test_writer_117(self, byteorder, temp_file): original = DataFrame( data=[ [ @@ -1714,6 +1793,7 @@ def test_writer_117(self, temp_file): original.to_stata( path, convert_dates={"datetime": "tc"}, + byteorder=byteorder, convert_strl=["forced_strl"], version=117, ) @@ -1765,6 +1845,7 @@ def test_invalid_date_conversion(self, temp_file): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version, temp_file): # GH 21041 @@ -1783,6 +1864,7 @@ def test_nonfile_writing(self, version, temp_file): reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_gzip_writing(self, temp_file): # writing version 117 requires seek and cannot be used with gzip df = DataFrame( @@ -1798,8 +1880,18 @@ def test_gzip_writing(self, temp_file): reread = read_stata(gz, index_col="index") tm.assert_frame_equal(df, reread) - def test_unicode_dta_118(self, datapath): - unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata16_118", + "stata16_be_118", + 
"stata16_119", + "stata16_be_119", + ], + ) + def test_unicode_dta_118_119(self, file, datapath): + unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"] values = [ @@ -1815,6 +1907,7 @@ def test_unicode_dta_118(self, datapath): tm.assert_frame_equal(unicode_df, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mixed_string_strl(self, temp_file): # GH 23633 output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] @@ -1907,8 +2000,10 @@ def test_stata_119(self, datapath): reader._ensure_open() assert reader._nvar == 32999 + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [118, 119, None]) - def test_utf8_writer(self, version, temp_file): + @pytest.mark.parametrize("byteorder", ["little", "big"]) + def test_utf8_writer(self, version, byteorder, temp_file): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) data = DataFrame( [ @@ -1936,6 +2031,7 @@ def test_utf8_writer(self, version, temp_file): convert_strl=["strls"], variable_labels=variable_labels, write_index=False, + byteorder=byteorder, version=version, value_labels=value_labels, ) @@ -1999,6 +2095,52 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + @pytest.mark.parametrize("version", [113, 114, 115, 117, 118, 119]) + def test_read_data_int_validranges(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-127, 100], dtype=np.int8), + "int": np.array([-32767, 32740], dtype=np.int16), + "long": np.array([-2147483647, 2147483620], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("version", [104, 105, 108, 110, 111]) + def test_read_data_int_validranges_compat(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-128, 126], dtype=np.int8), + "int": np.array([-32768, 32766], dtype=np.int16), + "long": np.array([-2147483648, 2147483646], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + + # The byte type was not supported prior to the 104 format + @pytest.mark.parametrize("version", [102, 103]) + def test_read_data_int_validranges_compat_nobyte(self, version, datapath): + expected = DataFrame( + { + "byte": np.array([-128, 126], dtype=np.int16), + "int": np.array([-32768, 32766], dtype=np.int16), + "long": np.array([-2147483648, 2147483646], dtype=np.int32), + } + ) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta") + ) + tm.assert_frame_equal(parsed, expected) + @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114]) def test_backward_compat(version, datapath): @@ -2010,6 +2152,31 @@ def test_backward_compat(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) +@pytest.mark.parametrize("version", [103, 104]) +def test_backward_compat_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + data_base = datapath("io", "data", "stata") + ref = os.path.join(data_base, "stata-compat-118.dta") + old = os.path.join(data_base, f"stata-compat-{version}.dta") 
+ expected = read_stata(ref, convert_dates=False) + old_dta = read_stata(old, convert_dates=False) + tm.assert_frame_equal(old_dta, expected, check_dtype=False) + + +@pytest.mark.parametrize("version", [102]) +def test_backward_compat_nostring(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", "stata-compat-118.dta") + old = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + expected = read_stata(ref, convert_dates=False) + # The Stata data format prior to 103 did not support string data + expected = expected.drop(columns=["s10"]) + old_dta = read_stata(old, convert_dates=False) + tm.assert_frame_equal(old_dta, expected, check_dtype=False) + + @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118]) def test_bigendian(version, datapath): ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") @@ -2019,6 +2186,18 @@ def test_bigendian(version, datapath): tm.assert_frame_equal(big_dta, expected) +# Note: 102 format does not support big-endian byte order +@pytest.mark.parametrize("version", [103, 104]) +def test_bigendian_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta") + expected = read_stata(ref, convert_dates=False) + big_dta = read_stata(big, convert_dates=False) + tm.assert_frame_equal(big_dta, expected) + + def test_direct_read(datapath, monkeypatch): file_path = datapath("io", "data", "stata", "stata-compat-118.dta") @@ -2169,6 +2348,7 @@ def test_iterator_errors(datapath, chunksize): pass +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iterator_value_labels(temp_file): # GH 31544 values = ["c_label", "b_label"] + ["a_label"] * 500 diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 357e6129dd8f1..036a5d6265dd7 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,6 +14,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat import WASM from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -244,7 +246,8 @@ "-87.65362593118043,41.94742799535678,0" ), }, - } + }, + index=range(5), ) @@ -414,7 +417,7 @@ def test_string_charset(parser): df_str = read_xml(StringIO(txt), parser=parser) - df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0]) + df_expected = DataFrame({"c1": 1, "c2": 2}, index=range(1)) tm.assert_frame_equal(df_str, df_expected) @@ -1038,7 +1041,7 @@ def test_utf16_encoding(xml_baby_names, parser): UnicodeError, match=( "UTF-16 stream does not start with BOM|" - "'utf-16-le' codec can't decode byte" + "'utf-16(-le)?' 
codec can't decode byte" ), ): read_xml(xml_baby_names, encoding="UTF-16", parser=parser) @@ -1989,6 +1992,7 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): tm.assert_frame_equal(df_lxml, df_etree) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_read_xml_nullable_dtypes( parser, string_storage, dtype_backend, using_infer_string ): diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 96ef50f9d7149..409aafee58e49 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -4,6 +4,8 @@ import pytest +from pandas._config import using_string_dtype + from pandas.errors import ParserWarning import pandas.util._test_decorators as td @@ -83,6 +85,7 @@ def read_xml_iterparse(data, **kwargs): # DTYPE +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dtype_single_str(parser): df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( @@ -208,6 +211,7 @@ def test_wrong_dtype(xml_books, parser, iterparse): ) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_both_dtype_converters(parser): df_expected = DataFrame( { diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index b70386191d9d9..50b561aefcf49 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -1,3 +1,4 @@ +from collections import namedtuple from collections.abc import Generator from contextlib import contextmanager import re @@ -405,9 +406,8 @@ def test_nan_complex_real(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_nan_complex_imag(self): nan1 = complex(1, float("nan")) @@ -417,9 +417,8 @@ def test_nan_complex_imag(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_nan_in_tuple(self): nan1 = (float("nan"),) @@ -436,9 +435,28 @@ def test_nan_in_nested_tuple(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): + table.get_item(other) + + def test_nan_in_namedtuple(self): + T = namedtuple("T", ["x"]) + nan1 = T(float("nan")) + nan2 = T(float("nan")) + assert nan1.x is not nan2.x + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_nan_in_nested_namedtuple(self): + T = namedtuple("T", ["x", "y"]) + nan1 = T(1, (2, (float("nan"),))) + nan2 = T(1, (2, (float("nan"),))) + other = T(1, 2) + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_hash_equal_tuple_with_nans(): @@ -448,6 +466,22 @@ def test_hash_equal_tuple_with_nans(): assert ht.objects_are_equal(a, b) +def test_hash_equal_namedtuple_with_nans(): + T = namedtuple("T", ["x", "y"]) + a = T(float("nan"), 
(float("nan"), float("nan"))) + b = T(float("nan"), (float("nan"), float("nan"))) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + +def test_hash_equal_namedtuple_and_tuple(): + T = namedtuple("T", ["x", "y"]) + a = T(1, (2, 3)) + b = (1, (2, 3)) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 5a46cdcb051b6..d8c49d6d47f28 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -76,8 +76,6 @@ def _check_data(xp, rs): xp : matplotlib Axes object rs : matplotlib Axes object """ - import matplotlib.pyplot as plt - xp_lines = xp.get_lines() rs_lines = rs.get_lines() @@ -87,8 +85,6 @@ def _check_data(xp, rs): rsdata = rsl.get_xydata() tm.assert_almost_equal(xpdata, rsdata) - plt.close("all") - def _check_visible(collections, visible=True): """ @@ -495,6 +491,28 @@ def get_y_axis(ax): return ax._shared_axes["y"] +def assert_is_valid_plot_return_object(objs) -> None: + from matplotlib.artist import Artist + from matplotlib.axes import Axes + + if isinstance(objs, (Series, np.ndarray)): + if isinstance(objs, Series): + objs = objs._values + for el in objs.reshape(-1): + msg = ( + "one of 'objs' is not a matplotlib Axes instance, " + f"type encountered {type(el).__name__!r}" + ) + assert isinstance(el, (Axes, dict)), msg + else: + msg = ( + "objs is neither an ndarray of Artist instances nor a single " + "ArtistArtist instance, tuple, or dict, 'objs' is a " + f"{type(objs).__name__!r}" + ) + assert isinstance(objs, (Artist, tuple, dict)), msg + + def _check_plot_works(f, default_axes=False, **kwargs): """ Create plot and ensure that plot return object is valid. 
@@ -530,15 +548,11 @@ def _check_plot_works(f, default_axes=False, **kwargs): gen_plots = _gen_two_subplots ret = None - try: - fig = kwargs.get("figure", plt.gcf()) - plt.clf() - - for ret in gen_plots(f, fig, **kwargs): - tm.assert_is_valid_plot_return_object(ret) + fig = kwargs.get("figure", plt.gcf()) + fig.clf() - finally: - plt.close(fig) + for ret in gen_plots(f, fig, **kwargs): + assert_is_valid_plot_return_object(ret) return ret diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index adb56a40b0071..b381c4fce8430 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1120,7 +1120,7 @@ def test_boxplot_return_type_invalid_type(self, return_type): def test_kde_df(self): pytest.importorskip("scipy") - df = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) ax = _check_plot_works(df.plot, kind="kde") expected = [pprint_thing(c) for c in df.columns] _check_legend_labels(ax, labels=expected) @@ -1177,20 +1177,16 @@ def test_hist_df_series(self): _check_ticks_props(axes, xrot=40, yrot=0) def test_hist_df_series_cumulative_density(self): - from matplotlib.patches import Rectangle - series = Series(np.random.default_rng(2).random(10)) ax = series.plot.hist(cumulative=True, bins=4, density=True) # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-1].get_height(), 1.0) def test_hist_df_series_cumulative(self): - from matplotlib.patches import Rectangle - series = Series(np.random.default_rng(2).random(10)) ax = series.plot.hist(cumulative=True, bins=4) - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-2].get_height(), 10.0) @@ -1385,8 +1381,6 @@ def test_plot_int_columns(self): ], ) def test_style_by_column(self, markers): - import matplotlib.pyplot as plt - fig = plt.gcf() fig.clf() fig.add_subplot(111) @@ -1969,9 +1963,6 @@ def test_sharex_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - - plt.close("all") gs, axes = _generate_4_axes_via_gridspec() df = DataFrame( @@ -2009,8 +2000,6 @@ def test_sharex_false_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - df = DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -2035,8 +2024,6 @@ def test_sharey_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed - import matplotlib.pyplot as plt - gs, axes = _generate_4_axes_via_gridspec() df = DataFrame( @@ -2073,8 +2060,6 @@ def _check(axes): def test_sharey_and_ax_tight(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, - import matplotlib.pyplot as plt - df = DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -2134,9 +2119,6 @@ def test_memory_leak(self, kind): def 
test_df_gridspec_patterns_vert_horiz(self):
         # GH 10819
-        from matplotlib import gridspec
-        import matplotlib.pyplot as plt
-
         ts = Series(
             np.random.default_rng(2).standard_normal(10),
             index=date_range("1/1/2000", periods=10),
@@ -2149,14 +2131,14 @@ def test_df_gridspec_patterns_vert_horiz(self):
         )

         def _get_vertical_grid():
-            gs = gridspec.GridSpec(3, 1)
+            gs = mpl.gridspec.GridSpec(3, 1)
             fig = plt.figure()
             ax1 = fig.add_subplot(gs[:2, :])
             ax2 = fig.add_subplot(gs[2, :])
             return ax1, ax2

         def _get_horizontal_grid():
-            gs = gridspec.GridSpec(1, 3)
+            gs = mpl.gridspec.GridSpec(1, 3)
             fig = plt.figure()
             ax1 = fig.add_subplot(gs[:, :2])
             ax2 = fig.add_subplot(gs[:, 2])
@@ -2217,9 +2199,6 @@ def _get_horizontal_grid():

     def test_df_gridspec_patterns_boxed(self):
         # GH 10819
-        from matplotlib import gridspec
-        import matplotlib.pyplot as plt
-
         ts = Series(
             np.random.default_rng(2).standard_normal(10),
             index=date_range("1/1/2000", periods=10),
@@ -2227,7 +2206,7 @@ def test_df_gridspec_patterns_boxed(self):

         # boxed
         def _get_boxed_grid():
-            gs = gridspec.GridSpec(3, 3)
+            gs = mpl.gridspec.GridSpec(3, 3)
             fig = plt.figure()
             ax1 = fig.add_subplot(gs[:2, :2])
             ax2 = fig.add_subplot(gs[:2, 2])
@@ -2595,8 +2574,6 @@ def test_plot_period_index_makes_no_right_shift(self, freq):


 def _generate_4_axes_via_gridspec():
-    import matplotlib.pyplot as plt
-
     gs = mpl.gridspec.GridSpec(2, 2)
     ax_tl = plt.subplot(gs[0, 0])
     ax_ll = plt.subplot(gs[1, 0])
diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py
index 76d3b20aaa2c6..4b35e896e1a6c 100644
--- a/pandas/tests/plotting/frame/test_frame_color.py
+++ b/pandas/tests/plotting/frame/test_frame_color.py
@@ -364,14 +364,16 @@ def test_line_colors_and_styles_subplots_list_styles(self):
         _check_colors(ax.get_lines(), linecolors=[c])

     def test_area_colors(self):
-        from matplotlib.collections import PolyCollection
-
         custom_colors = "rgcby"
         df = DataFrame(np.random.default_rng(2).random((5, 5)))
         ax = df.plot.area(color=custom_colors)
         _check_colors(ax.get_lines(), linecolors=custom_colors)

-        poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)]
+        poly = [
+            o
+            for o in ax.get_children()
+            if isinstance(o, mpl.collections.PolyCollection)
+        ]
         _check_colors(poly, facecolors=custom_colors)

         handles, _ = ax.get_legend_handles_labels()
@@ -381,14 +383,15 @@ def test_area_colors(self):
             assert h.get_alpha() is None

     def test_area_colors_poly(self):
-        from matplotlib import cm
-        from matplotlib.collections import PolyCollection
-
         df = DataFrame(np.random.default_rng(2).random((5, 5)))
         ax = df.plot.area(colormap="jet")
-        jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))]
+        jet_colors = [mpl.cm.jet(n) for n in np.linspace(0, 1, len(df))]
         _check_colors(ax.get_lines(), linecolors=jet_colors)

-        poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)]
+        poly = [
+            o
+            for o in ax.get_children()
+            if isinstance(o, mpl.collections.PolyCollection)
+        ]
         _check_colors(poly, facecolors=jet_colors)

         handles, _ = ax.get_legend_handles_labels()
@@ -397,15 +400,16 @@ def test_area_colors_poly(self):
             assert h.get_alpha() is None

     def test_area_colors_stacked_false(self):
-        from matplotlib import cm
-        from matplotlib.collections import PolyCollection
-
         df = DataFrame(np.random.default_rng(2).random((5, 5)))
-        jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))]
+        jet_colors = [mpl.cm.jet(n) for n in np.linspace(0, 1, len(df))]

         # When stacked=False, alpha is set to 0.5
-        ax = df.plot.area(colormap=cm.jet, stacked=False)
+        ax = df.plot.area(colormap=mpl.cm.jet, stacked=False)
         _check_colors(ax.get_lines(), linecolors=jet_colors)

-        poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)]
+        poly = [
+            o
+            for o in ax.get_children()
+            if isinstance(o, mpl.collections.PolyCollection)
+        ]
         jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors]
         _check_colors(poly, facecolors=jet_with_alpha)
diff --git a/pandas/tests/plotting/frame/test_frame_legend.py b/pandas/tests/plotting/frame/test_frame_legend.py
index 402a4b9531e5d..a9723fe4ef871 100644
--- a/pandas/tests/plotting/frame/test_frame_legend.py
+++ b/pandas/tests/plotting/frame/test_frame_legend.py
@@ -26,9 +26,6 @@ class TestFrameLegend:
     )
     def test_mixed_yerr(self):
         # https://github.com/pandas-dev/pandas/issues/39522
-        from matplotlib.collections import LineCollection
-        from matplotlib.lines import Line2D
-
         df = DataFrame([{"x": 1, "a": 1, "b": 1}, {"x": 2, "a": 2, "b": 3}])

         ax = df.plot("x", "a", c="orange", yerr=0.1, label="orange")
@@ -40,8 +37,8 @@ def test_mixed_yerr(self):
         else:
             result_handles = legend.legend_handles

-        assert isinstance(result_handles[0], LineCollection)
-        assert isinstance(result_handles[1], Line2D)
+        assert isinstance(result_handles[0], mpl.collections.LineCollection)
+        assert isinstance(result_handles[1], mpl.lines.Line2D)

     def test_legend_false(self):
         # https://github.com/pandas-dev/pandas/issues/40044
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index 573f95eed15ef..4916963ab7c87 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -38,9 +38,7 @@ def _check_ax_limits(col, ax):
 class TestDataFramePlots:
     def test_stacked_boxplot_set_axis(self):
         # GH2980
-        import matplotlib.pyplot as plt
-
-        n = 80
+        n = 30
         df = DataFrame(
             {
                 "Clinical": np.random.default_rng(2).choice([0, 1, 2, 3], n),
@@ -51,10 +49,10 @@
         )
         ax = df.plot(kind="bar", stacked=True)
         assert [int(x.get_text()) for x in ax.get_xticklabels()] == df.index.to_list()
-        ax.set_xticks(np.arange(0, 80, 10))
+        ax.set_xticks(np.arange(0, n, 10))
         plt.draw()  # Update changes
         assert [int(x.get_text()) for x in ax.get_xticklabels()] == list(
-            np.arange(0, 80, 10)
+            np.arange(0, n, 10)
         )

     @pytest.mark.slow
@@ -227,12 +225,12 @@ def test_boxplot_numeric_data(self):
         # GH 22799
         df = DataFrame(
             {
-                "a": date_range("2012-01-01", periods=100),
-                "b": np.random.default_rng(2).standard_normal(100),
-                "c": np.random.default_rng(2).standard_normal(100) + 2,
-                "d": date_range("2012-01-01", periods=100).astype(str),
-                "e": date_range("2012-01-01", periods=100, tz="UTC"),
-                "f": timedelta_range("1 days", periods=100),
+                "a": date_range("2012-01-01", periods=10),
+                "b": np.random.default_rng(2).standard_normal(10),
+                "c": np.random.default_rng(2).standard_normal(10) + 2,
+                "d": date_range("2012-01-01", periods=10).astype(str),
+                "e": date_range("2012-01-01", periods=10, tz="UTC"),
+                "f": timedelta_range("1 days", periods=10),
             }
         )
         ax = df.plot(kind="box")
@@ -282,8 +280,6 @@ def test_color_kwd(self, colors_kwd, expected):
     def test_colors_in_theme(self, scheme, expected):
         # GH: 40769
         df = DataFrame(np.random.default_rng(2).random((10, 2)))
-        import matplotlib.pyplot as plt
-
         plt.style.use(scheme)
         result = df.plot.box(return_type="dict")
         for k, v in expected.items():
@@ -334,8 +330,8 @@ def test_plot_xlabel_ylabel(self, vert):
     def test_plot_box(self, vert):
         # GH 54941
         rng = np.random.default_rng(2)
-        df1 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD"))
-        df2 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD"))
+        df1 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD"))
+        df2 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD"))

         xlabel, ylabel = "x", "y"
         _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True)
@@ -344,7 +340,6 @@ def test_plot_box(self, vert):
         for ax in axs:
             assert ax.get_xlabel() == xlabel
             assert ax.get_ylabel() == ylabel
-        mpl.pyplot.close()

     @pytest.mark.parametrize("vert", [True, False])
     def test_boxplot_xlabel_ylabel(self, vert):
@@ -374,7 +369,6 @@ def test_boxplot_group_xlabel_ylabel(self, vert):
         for subplot in ax:
             assert subplot.get_xlabel() == xlabel
             assert subplot.get_ylabel() == ylabel
-        mpl.pyplot.close()

     @pytest.mark.parametrize("vert", [True, False])
     def test_boxplot_group_no_xlabel_ylabel(self, vert):
@@ -389,7 +383,6 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert):
         for subplot in ax:
             target_label = subplot.get_xlabel() if vert else subplot.get_ylabel()
             assert target_label == pprint_thing(["group"])
-        mpl.pyplot.close()


 class TestDataFrameGroupByPlots:
diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index d4774a5cd0439..cfdfa7f723599 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -34,15 +34,11 @@
     Second,
 )

-try:
-    from pandas.plotting._matplotlib import converter
-except ImportError:
-    # try / except, rather than skip, to avoid internal refactoring
-    # causing an improper skip
-    pass
-
-pytest.importorskip("matplotlib.pyplot")
+plt = pytest.importorskip("matplotlib.pyplot")
 dates = pytest.importorskip("matplotlib.dates")
+units = pytest.importorskip("matplotlib.units")
+
+from pandas.plotting._matplotlib import converter


 @pytest.mark.single_cpu
@@ -79,30 +75,22 @@ def test_dont_register_by_default(self):
         assert subprocess.check_call(call) == 0

     def test_registering_no_warning(self):
-        plt = pytest.importorskip("matplotlib.pyplot")
         s = Series(range(12), index=date_range("2017", periods=12))
         _, ax = plt.subplots()

         # Set to the "warn" state, in case this isn't the first test run
         register_matplotlib_converters()
         ax.plot(s.index, s.values)
-        plt.close()

     def test_pandas_plots_register(self):
-        plt = pytest.importorskip("matplotlib.pyplot")
         s = Series(range(12), index=date_range("2017", periods=12))
         # Set to the "warn" state, in case this isn't the first test run
         with tm.assert_produces_warning(None) as w:
             s.plot()

-        try:
-            assert len(w) == 0
-        finally:
-            plt.close()
+        assert len(w) == 0

     def test_matplotlib_formatters(self):
-        units = pytest.importorskip("matplotlib.units")
-
         # Can't make any assertion about the start state.
         # We check that toggling converters off removes it, and toggling it
         # on restores it.
@@ -113,8 +101,6 @@ def test_matplotlib_formatters(self): assert Timestamp in units.registry def test_option_no_warning(self): - pytest.importorskip("matplotlib.pyplot") - plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() @@ -126,12 +112,8 @@ def test_option_no_warning(self): register_matplotlib_converters() with cf.option_context("plotting.matplotlib.register_converters", False): ax.plot(s.index, s.values) - plt.close() def test_registry_resets(self): - units = pytest.importorskip("matplotlib.units") - dates = pytest.importorskip("matplotlib.dates") - # make a copy, to reset to original = dict(units.registry) @@ -214,7 +196,7 @@ def test_conversion_float(self, dtc): rtol = 0.5 * 10**-9 rs = dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None) - xp = converter.mdates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) + xp = dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) tm.assert_almost_equal(rs, xp, rtol=rtol) rs = dtc.convert( @@ -235,10 +217,10 @@ def test_conversion_float(self, dtc): def test_conversion_outofbounds_datetime(self, dtc, values): # 2579 rs = dtc.convert(values, None, None) - xp = converter.mdates.date2num(values) + xp = dates.date2num(values) tm.assert_numpy_array_equal(rs, xp) rs = dtc.convert(values[0], None, None) - xp = converter.mdates.date2num(values[0]) + xp = dates.date2num(values[0]) assert rs == xp @pytest.mark.parametrize( @@ -261,7 +243,7 @@ def test_dateindex_conversion(self, freq, dtc): rtol = 10**-9 dateindex = date_range("2020-01-01", periods=10, freq=freq) rs = dtc.convert(dateindex, None, None) - xp = converter.mdates.date2num(dateindex._mpl_repr()) + xp = dates.date2num(dateindex._mpl_repr()) tm.assert_almost_equal(rs, xp, rtol=rtol) @pytest.mark.parametrize("offset", [Second(), Milli(), Micro(50)]) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 4b4eeada58366..1275f3d6f7d6d 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -46,6 +46,8 @@ mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +import pandas.plotting._matplotlib.converter as conv + class TestTSPlot: @pytest.mark.filterwarnings("ignore::UserWarning") @@ -73,7 +75,7 @@ def test_fontsize_set_correctly(self): def test_frame_inferred(self): # inferred freq - idx = date_range("1/1/1987", freq="MS", periods=100) + idx = date_range("1/1/1987", freq="MS", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame( @@ -82,7 +84,7 @@ def test_frame_inferred(self): _check_plot_works(df.plot) # axes freq - idx = idx[0:40].union(idx[45:99]) + idx = idx[0:4].union(idx[6:]) df2 = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx ) @@ -111,7 +113,6 @@ def test_nonnumeric_exclude(self): fig, ax = mpl.pyplot.subplots() df.plot(ax=ax) # it works assert len(ax.get_lines()) == 1 # B was plotted - mpl.pyplot.close(fig) def test_nonnumeric_exclude_error(self): idx = date_range("1/1/1987", freq="YE", periods=3) @@ -122,7 +123,7 @@ def test_nonnumeric_exclude_error(self): @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + idx = period_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, 
ax=ax) @@ -131,7 +132,7 @@ def test_tsplot_period(self, freq): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_tsplot_datetime(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _, ax = mpl.pyplot.subplots() _check_plot_works(ser.plot, ax=ax) @@ -145,10 +146,9 @@ def test_tsplot(self): color = (0.0, 0.0, 0.0, 1) assert color == ax.get_lines()[0].get_color() - def test_both_style_and_color(self): - ts = Series( - np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) - ) + @pytest.mark.parametrize("index", [None, date_range("2020-01-01", periods=10)]) + def test_both_style_and_color(self, index): + ts = Series(np.arange(10, dtype=np.float64), index=index) msg = ( "Cannot pass 'style' string with a color symbol and 'color' " "keyword argument. Please use one or the other or pass 'style' " @@ -157,46 +157,37 @@ def test_both_style_and_color(self): with pytest.raises(ValueError, match=msg): ts.plot(style="b-", color="#000099") - s = ts.reset_index(drop=True) - with pytest.raises(ValueError, match=msg): - s.plot(style="b-", color="#000099") - @pytest.mark.parametrize("freq", ["ms", "us"]) def test_high_freq(self, freq): _, ax = mpl.pyplot.subplots() - rng = date_range("1/1/2012", periods=100, freq=freq) + rng = date_range("1/1/2012", periods=10, freq=freq) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): - from pandas.plotting._matplotlib.converter import get_datevalue - - assert get_datevalue(None, "D") is None - assert get_datevalue(1987, "Y") == 1987 - assert get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal - assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - - def test_ts_plot_format_coord(self): - def check_format_of_first_point(ax, expected_string): - first_line = ax.get_lines()[0] - first_x = first_line.get_xdata()[0].ordinal - first_y = first_line.get_ydata()[0] - assert expected_string == ax.format_coord(first_x, first_y) + assert conv.get_datevalue(None, "D") is None + assert conv.get_datevalue(1987, "Y") == 1987 + assert ( + conv.get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal + ) + assert conv.get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal - annual = Series(1, index=date_range("2014-01-01", periods=3, freq="YE-DEC")) + @pytest.mark.parametrize( + "freq, expected_string", + [["YE-DEC", "t = 2014 y = 1.000000"], ["D", "t = 2014-01-01 y = 1.000000"]], + ) + def test_ts_plot_format_coord(self, freq, expected_string): + ser = Series(1, index=date_range("2014-01-01", periods=3, freq=freq)) _, ax = mpl.pyplot.subplots() - annual.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014 y = 1.000000") - - # note this is added to the annual plot already in existence, and - # changes its freq field - daily = Series(1, index=date_range("2014-01-01", periods=3, freq="D")) - daily.plot(ax=ax) - check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") + ser.plot(ax=ax) + first_line = ax.get_lines()[0] + first_x = first_line.get_xdata()[0].ordinal + first_y = first_line.get_ydata()[0] + assert expected_string == ax.format_coord(first_x, first_y) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): - idx = period_range("12/31/1999", freq=freq, periods=100) + 
idx = period_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @@ -206,7 +197,7 @@ def test_line_plot_period_series(self, freq): def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the # frequency (`frqncy`) rule code. tests resolution of issue #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) s = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(s.plot, s.index.freq.rule_code) @@ -214,13 +205,13 @@ def test_line_plot_period_mlt_series(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "QE", "YE"]) def test_line_plot_period_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -235,7 +226,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) # of the frequency (`frqncy`) rule code. tests resolution of issue # #14763 - idx = period_range("12/31/1999", freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -249,7 +240,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx, @@ -263,7 +254,7 @@ def test_line_plot_datetime_frame(self, freq): "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): - idx = date_range("12/31/1999", freq=freq, periods=100) + idx = date_range("12/31/1999", freq=freq, periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) ser = Series(ser.values, Index(np.asarray(ser.index))) _check_plot_works(ser.plot, ser.index.inferred_freq) @@ -350,8 +341,8 @@ def test_business_freq(self): def test_business_freq_convert(self): bts = Series( - np.arange(300, dtype=np.float64), - index=date_range("2020-01-01", periods=300, freq="B"), + np.arange(50, dtype=np.float64), + index=date_range("2020-01-01", periods=50, freq="B"), ).asfreq("BME") ts = bts.to_period("M") _, ax = mpl.pyplot.subplots() @@ -444,12 +435,8 @@ def test_axis_limits(self, obj): result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal - fig = ax.get_figure() - mpl.pyplot.close(fig) def test_get_finder(self): - import pandas.plotting._matplotlib.converter as conv - assert conv.get_finder(to_offset("B")) == conv._daily_finder assert conv.get_finder(to_offset("D")) == conv._daily_finder assert conv.get_finder(to_offset("ME")) == conv._monthly_finder @@ -552,7 
+539,7 @@ def test_finder_annual(self): @pytest.mark.slow def test_finder_minutely(self): - nminutes = 50 * 24 * 60 + nminutes = 1 * 24 * 60 rng = date_range("1/1/1999", freq="Min", periods=nminutes) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -577,9 +564,9 @@ def test_finder_hourly(self): def test_gaps(self): ts = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - ts.iloc[5:25] = np.nan + ts.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() ts.plot(ax=ax) lines = ax.get_lines() @@ -591,8 +578,7 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() - mpl.pyplot.close(ax.get_figure()) + assert mask[5:7, 1].all() def test_gaps_irregular(self): # irregular @@ -613,7 +599,6 @@ def test_gaps_irregular(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[2:5, 1].all() - mpl.pyplot.close(ax.get_figure()) def test_gaps_non_ts(self): # non-ts @@ -634,9 +619,9 @@ def test_gaps_non_ts(self): def test_gap_upsample(self): low = Series( - np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) ) - low.iloc[5:25] = np.nan + low.iloc[5:7] = np.nan _, ax = mpl.pyplot.subplots() low.plot(ax=ax) @@ -653,7 +638,7 @@ def test_gap_upsample(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask - assert mask[5:25, 1].all() + assert mask[5:7, 1].all() def test_secondary_y(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -667,7 +652,6 @@ def test_secondary_y(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_yaxis(self): Series(np.random.default_rng(2).standard_normal(10)) @@ -675,7 +659,6 @@ def test_secondary_y_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_both(self): ser = Series(np.random.default_rng(2).standard_normal(10)) @@ -701,7 +684,6 @@ def test_secondary_y_ts(self): tm.assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() - mpl.pyplot.close(fig) def test_secondary_y_ts_yaxis(self): idx = date_range("1/1/2000", periods=10) @@ -709,7 +691,6 @@ def test_secondary_y_ts_yaxis(self): _, ax2 = mpl.pyplot.subplots() ser2.plot(ax=ax2) assert ax2.get_yaxis().get_ticks_position() == "left" - mpl.pyplot.close(ax2.get_figure()) def test_secondary_y_ts_visible(self): idx = date_range("1/1/2000", periods=10) @@ -1108,8 +1089,8 @@ def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # high to low @@ -1122,8 +1103,8 @@ def test_mixed_freq_second_millisecond(self): def 
test_mixed_freq_second_millisecond_low_to_high(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="s", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=5) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # low to high @@ -1298,7 +1279,6 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_right(self): df = DataFrame( @@ -1315,7 +1295,6 @@ def test_secondary_legend_right(self): assert leg.get_texts()[1].get_text() == "B" assert leg.get_texts()[2].get_text() == "C" assert leg.get_texts()[3].get_text() == "D" - mpl.pyplot.close(fig) def test_secondary_legend_bar(self): df = DataFrame( @@ -1328,7 +1307,6 @@ def test_secondary_legend_bar(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A (right)" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_bar_right(self): df = DataFrame( @@ -1341,7 +1319,6 @@ def test_secondary_legend_bar_right(self): leg = ax.get_legend() assert leg.get_texts()[0].get_text() == "A" assert leg.get_texts()[1].get_text() == "B" - mpl.pyplot.close(fig) def test_secondary_legend_multi_col(self): df = DataFrame( @@ -1366,14 +1343,13 @@ def test_secondary_legend_multi_col(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close(fig) def test_secondary_legend_nonts(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1387,14 +1363,13 @@ def test_secondary_legend_nonts(self): # TODO: color cycle problems assert len(colors) == 4 - mpl.pyplot.close() def test_secondary_legend_nonts_multi_col(self): # non-ts df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), + 1.1 * np.arange(40).reshape((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) @@ -1448,13 +1423,10 @@ def test_mpl_nopandas(self): exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) - exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp) def test_irregular_ts_shared_ax_xlim(self): # GH 2960 - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1467,8 +1439,8 @@ def test_irregular_ts_shared_ax_xlim(self): # check that axis limits are correct left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_secondary_y_non_ts_xlim(self): # GH 3490 - non-timeseries with secondary y @@ -1504,7 +1476,7 @@ 
def test_secondary_y_regular_ts_xlim(self): def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed frequency timeseries with secondary y - rng = date_range("2000-01-01", periods=10000, freq="min") + rng = date_range("2000-01-01", periods=10, freq="min") ts = Series(1, index=rng) _, ax = mpl.pyplot.subplots() @@ -1519,8 +1491,6 @@ def test_secondary_y_mixed_freq_ts_xlim(self): def test_secondary_y_irregular_ts_xlim(self): # GH 3490 - irregular-timeseries with secondary y - from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = Series( np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) ) @@ -1534,8 +1504,8 @@ def test_secondary_y_irregular_ts_xlim(self): ts_irregular[:5].plot(ax=ax) left, right = ax.get_xlim() - assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) - assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) + assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_plot_outofbounds_datetime(self): # 2579 - checking this does not raise @@ -1573,7 +1543,7 @@ def test_format_timedelta_ticks_wide(self): "9 days 06:13:20", ] - rng = timedelta_range("0", periods=10, freq="1 d") + rng = timedelta_range("0", periods=10, freq="1 D") df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng) _, ax = mpl.pyplot.subplots() ax = df.plot(fontsize=2, ax=ax) @@ -1592,7 +1562,7 @@ def test_timedelta_plot(self): def test_timedelta_long_period(self): # test long period - index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 d") + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 D") s = Series(np.random.default_rng(2).standard_normal(len(index)), index) _, ax = mpl.pyplot.subplots() _check_plot_works(s.plot, ax=ax) @@ -1722,35 +1692,28 @@ def test_pickle_fig(self, temp_file, frame_or_series, idx): def _check_plot_works(f, freq=None, series=None, *args, **kwargs): - import matplotlib.pyplot as plt - fig = plt.gcf() - try: - plt.clf() - ax = fig.add_subplot(211) - orig_ax = kwargs.pop("ax", plt.gca()) - orig_axfreq = getattr(orig_ax, "freq", None) - - ret = f(*args, **kwargs) - assert ret is not None # do something more intelligent - - ax = kwargs.pop("ax", plt.gca()) - if series is not None: - dfreq = series.index.freq - if isinstance(dfreq, BaseOffset): - dfreq = dfreq.rule_code - if orig_axfreq is None: - assert ax.freq == dfreq - - if freq is not None: - ax_freq = to_offset(ax.freq, is_period=True) - if freq is not None and orig_axfreq is None: - assert ax_freq == freq - - ax = fig.add_subplot(212) - kwargs["ax"] = ax - ret = f(*args, **kwargs) - assert ret is not None # TODO: do something more intelligent - finally: - plt.close(fig) + fig.clf() + ax = fig.add_subplot(211) + orig_ax = kwargs.pop("ax", plt.gca()) + orig_axfreq = getattr(orig_ax, "freq", None) + + ret = f(*args, **kwargs) + assert ret is not None # do something more intelligent + + ax = kwargs.pop("ax", plt.gca()) + if series is not None: + dfreq = series.index.freq + if isinstance(dfreq, BaseOffset): + dfreq = dfreq.rule_code + if orig_axfreq is None: + assert ax.freq == dfreq + + if freq is not None and orig_axfreq is None: + assert to_offset(ax.freq, is_period=True) == freq + + ax = fig.add_subplot(212) + kwargs["ax"] = ax + ret = f(*args, **kwargs) + assert ret is not None # TODO: do something more intelligent diff --git a/pandas/tests/plotting/test_hist_method.py 
b/pandas/tests/plotting/test_hist_method.py index 511c1dd7761d5..65cb62917dc4e 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -27,6 +27,9 @@ ) mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") + +from pandas.plotting._matplotlib.hist import _grouped_hist @pytest.fixture @@ -119,18 +122,13 @@ def test_hist_layout_with_by_shape(self, hist_df): _check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) def test_hist_no_overlap(self): - from matplotlib.pyplot import ( - gcf, - subplot, - ) - x = Series(np.random.default_rng(2).standard_normal(2)) y = Series(np.random.default_rng(2).standard_normal(2)) - subplot(121) + plt.subplot(121) x.hist() - subplot(122) + plt.subplot(122) y.hist() - fig = gcf() + fig = plt.gcf() axes = fig.axes assert len(axes) == 2 @@ -140,10 +138,8 @@ def test_hist_by_no_extra_plots(self, hist_df): assert len(mpl.pyplot.get_fignums()) == 1 def test_plot_fails_when_ax_differs_from_figure(self, ts): - from pylab import figure - - fig1 = figure() - fig2 = figure() + fig1 = plt.figure(1) + fig2 = plt.figure(2) ax1 = fig1.add_subplot(111) msg = "passed axis not bound to passed figure" with pytest.raises(AssertionError, match=msg): @@ -169,8 +165,8 @@ def test_histtype_argument(self, histtype, expected): ) def test_hist_with_legend(self, by, expected_axes_num, expected_layout): # GH 6279 - Series histogram can have a legend - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" # Use default_axes=True when plotting method generate subplots itself @@ -181,8 +177,8 @@ def test_hist_with_legend(self, by, expected_axes_num, expected_layout): @pytest.mark.parametrize("by", [None, "b"]) def test_hist_with_legend_raises(self, by): # GH 6279 - Series histogram with legend and label raises - index = 15 * ["1"] + 15 * ["2"] - s = Series(np.random.default_rng(2).standard_normal(30), index=index, name="a") + index = 5 * ["1"] + 5 * ["2"] + s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a") s.index.name = "b" with pytest.raises(ValueError, match="Cannot use both legend and label"): @@ -331,12 +327,10 @@ def test_hist_df_legacy_layout_labelsize_rot(self, frame_or_series): @pytest.mark.slow def test_hist_df_legacy_rectangles(self): - from matplotlib.patches import Rectangle - ser = Series(range(10)) ax = ser.hist(cumulative=True, bins=4, density=True) # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)] tm.assert_almost_equal(rects[-1].get_height(), 1.0) @pytest.mark.slow @@ -431,12 +425,12 @@ def test_hist_layout_error(self): # GH 9351 def test_tight_layout(self): - df = DataFrame(np.random.default_rng(2).standard_normal((100, 2))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) df[2] = to_datetime( np.random.default_rng(2).integers( 812419200000000000, 819331200000000000, - size=100, + size=10, dtype=np.int64, ) ) @@ -504,7 +498,7 @@ def test_hist_column_order_unchanged(self, column, expected): def test_histtype_argument(self, histtype, expected): # GH23992 Verify functioning of histtype argument df = DataFrame( - np.random.default_rng(2).integers(1, 10, size=(100, 2)), columns=["a", "b"] + 
np.random.default_rng(2).integers(1, 10, size=(10, 2)), columns=["a", "b"] ) ax = df.hist(histtype=histtype) _check_patches_all_filled(ax, filled=expected) @@ -519,9 +513,9 @@ def test_hist_with_legend(self, by, column): if by is not None: expected_labels = [expected_labels] * 2 - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -545,9 +539,9 @@ def test_hist_with_legend(self, by, column): @pytest.mark.parametrize("column", [None, "b"]) def test_hist_with_legend_raises(self, by, column): # GH 6279 - DataFrame histogram with legend and label raises - index = Index(15 * ["1"] + 15 * ["2"], name="c") + index = Index(5 * ["1"] + 5 * ["2"], name="c") df = DataFrame( - np.random.default_rng(2).standard_normal((30, 2)), + np.random.default_rng(2).standard_normal((10, 2)), index=index, columns=["a", "b"], ) @@ -586,7 +580,7 @@ def test_hist_df_with_nonnumerics_no_bins(self): def test_hist_secondary_legend(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # primary -> secondary @@ -602,7 +596,7 @@ def test_hist_secondary_legend(self): def test_hist_secondary_secondary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> secondary _, ax = mpl.pyplot.subplots() @@ -617,7 +611,7 @@ def test_hist_secondary_secondary(self): def test_hist_secondary_primary(self): # GH 9610 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 4)), columns=list("abcd") + np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) # secondary -> primary _, ax = mpl.pyplot.subplots() @@ -632,7 +626,6 @@ def test_hist_secondary_primary(self): def test_hist_with_nans_and_weights(self): # GH 48884 - mpl_patches = pytest.importorskip("matplotlib.patches") df = DataFrame( [[np.nan, 0.2, 0.3], [0.4, np.nan, np.nan], [0.7, 0.8, 0.9]], columns=list("abc"), @@ -643,12 +636,12 @@ def test_hist_with_nans_and_weights(self): _, ax0 = mpl.pyplot.subplots() df.plot.hist(ax=ax0, weights=weights) - rects = [x for x in ax0.get_children() if isinstance(x, mpl_patches.Rectangle)] + rects = [x for x in ax0.get_children() if isinstance(x, mpl.patches.Rectangle)] heights = [rect.get_height() for rect in rects] _, ax1 = mpl.pyplot.subplots() no_nan_df.plot.hist(ax=ax1, weights=no_nan_weights) no_nan_rects = [ - x for x in ax1.get_children() if isinstance(x, mpl_patches.Rectangle) + x for x in ax1.get_children() if isinstance(x, mpl.patches.Rectangle) ] no_nan_heights = [rect.get_height() for rect in no_nan_rects] assert all(h0 == h1 for h0, h1 in zip(heights, no_nan_heights)) @@ -663,8 +656,6 @@ def test_hist_with_nans_and_weights(self): class TestDataFrameGroupByPlots: def test_grouped_hist_legacy(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(10) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -716,10 +707,6 @@ def test_grouped_hist_legacy_single_key(self): _check_ticks_props(axes, xrot=30) def test_grouped_hist_legacy_grouped_hist_kwargs(self): - from matplotlib.patches import Rectangle - - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = 
np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -748,14 +735,14 @@ def test_grouped_hist_legacy_grouped_hist_kwargs(self): ) # height of last bin (index 5) must be 1.0 for ax in axes.ravel(): - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + rects = [ + x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle) + ] height = rects[-1].get_height() tm.assert_almost_equal(height, 1.0) _check_ticks_props(axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) def test_grouped_hist_legacy_grouped_hist(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( @@ -773,8 +760,6 @@ def test_grouped_hist_legacy_grouped_hist(self): _check_ax_scales(axes, yaxis="log") def test_grouped_hist_legacy_external_err(self): - from pandas.plotting._matplotlib.hist import _grouped_hist - rs = np.random.default_rng(2) df = DataFrame(rs.standard_normal((10, 1)), columns=["A"]) df["B"] = to_datetime( diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index d593ddbbaa0b8..43e1255404784 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -31,6 +31,8 @@ plt = pytest.importorskip("matplotlib.pyplot") cm = pytest.importorskip("matplotlib.cm") +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def iris(datapath) -> DataFrame: @@ -109,8 +111,6 @@ def test_savefig(kind, data, index): class TestSeriesPlots: def test_autocorrelation_plot(self): - from pandas.plotting import autocorrelation_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), @@ -118,32 +118,28 @@ def test_autocorrelation_plot(self): ) # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(autocorrelation_plot, series=ser) - _check_plot_works(autocorrelation_plot, series=ser.values) + _check_plot_works(plotting.autocorrelation_plot, series=ser) + _check_plot_works(plotting.autocorrelation_plot, series=ser.values) - ax = autocorrelation_plot(ser, label="Test") + ax = plotting.autocorrelation_plot(ser, label="Test") _check_legend_labels(ax, labels=["Test"]) @pytest.mark.parametrize("kwargs", [{}, {"lag": 5}]) def test_lag_plot(self, kwargs): - from pandas.plotting import lag_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(lag_plot, series=ser, **kwargs) + _check_plot_works(plotting.lag_plot, series=ser, **kwargs) def test_bootstrap_plot(self): - from pandas.plotting import bootstrap_plot - ser = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10), name="ts", ) - _check_plot_works(bootstrap_plot, series=ser, size=10) + _check_plot_works(plotting.bootstrap_plot, series=ser, size=10) class TestDataFramePlots: @@ -156,7 +152,7 @@ def test_scatter_matrix_axis(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(2).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 3))) # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning, check_stacklevel=False): @@ -168,7 +164,7 @@ def test_scatter_matrix_axis(self, pass_axis): ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() # GH 5662 - expected = ["-2", "0", "2"] + expected = 
["-2", "-1", "0"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @@ -181,7 +177,7 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): if pass_axis: _, ax = mpl.pyplot.subplots(3, 3) - df = DataFrame(np.random.default_rng(11).standard_normal((100, 3))) + df = DataFrame(np.random.default_rng(11).standard_normal((10, 3))) df[0] = (df[0] - 2) / 3 # we are plotting multiples on a sub-plot @@ -193,18 +189,15 @@ def test_scatter_matrix_axis_smaller(self, pass_axis): ax=ax, ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ["-1.0", "-0.5", "0.0"] + expected = ["-1.25", "-1.0", "-0.75", "-0.5"] _check_text_labels(axes0_labels, expected) _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow def test_andrews_curves_no_warning(self, iris): - from pandas.plotting import andrews_curves - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(andrews_curves, frame=df, class_column="Name") + _check_plot_works(plotting.andrews_curves, frame=iris, class_column="Name") @pytest.mark.slow @pytest.mark.parametrize( @@ -229,12 +222,10 @@ def test_andrews_curves_no_warning(self, iris): ], ) def test_andrews_curves_linecolors(self, request, df, linecolors): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=linecolors + plotting.andrews_curves, frame=df, class_column="Name", color=linecolors ) _check_colors( ax.get_lines()[:10], linecolors=linecolors, mapping=df["Name"][:10] @@ -256,23 +247,19 @@ def test_andrews_curves_linecolors(self, request, df, linecolors): ], ) def test_andrews_curves_cmap(self, request, df): - from pandas.plotting import andrews_curves - if isinstance(df, str): df = request.getfixturevalue(df) cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] ax = _check_plot_works( - andrews_curves, frame=df, class_column="Name", color=cmaps + plotting.andrews_curves, frame=df, class_column="Name", color=cmaps ) _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_andrews_curves_handle(self): - from pandas.plotting import andrews_curves - colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = andrews_curves(df, "Name", color=colors) + ax = plotting.andrews_curves(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) @@ -282,61 +269,54 @@ def test_andrews_curves_handle(self): [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_parallel_coordinates_colors(self, iris, color): - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", color=color + plotting.parallel_coordinates, frame=df, class_column="Name", color=color ) _check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import parallel_coordinates - df = iris ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet + plotting.parallel_coordinates, + frame=df, + class_column="Name", + colormap=cm.jet, ) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, 
df["Name"].nunique())] + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) @pytest.mark.slow def test_parallel_coordinates_line_diff(self, iris): - from pandas.plotting import parallel_coordinates - df = iris - ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") + ax = _check_plot_works( + plotting.parallel_coordinates, frame=df, class_column="Name" + ) nlines = len(ax.get_lines()) nxticks = len(ax.xaxis.get_ticklabels()) ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", axvlines=False + plotting.parallel_coordinates, frame=df, class_column="Name", axvlines=False ) assert len(ax.get_lines()) == (nlines - nxticks) @pytest.mark.slow def test_parallel_coordinates_handles(self, iris): - from pandas.plotting import parallel_coordinates - df = iris colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) - ax = parallel_coordinates(df, "Name", color=colors) + ax = plotting.parallel_coordinates(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") def test_parallel_coordinates_with_sorted_labels(self): - """For #15908""" - from pandas.plotting import parallel_coordinates - + # GH 15908 df = DataFrame( { "feat": list(range(30)), @@ -345,7 +325,7 @@ def test_parallel_coordinates_with_sorted_labels(self): + [1 for _ in range(10)], } ) - ax = parallel_coordinates(df, "class", sort_labels=True) + ax = plotting.parallel_coordinates(df, "class", sort_labels=True) polylines, labels = ax.get_legend_handles_labels() color_label_tuples = zip( [polyline.get_color() for polyline in polylines], labels @@ -359,45 +339,38 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] def test_radviz_no_warning(self, iris): - from pandas.plotting import radviz - - df = iris # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): - _check_plot_works(radviz, frame=df, class_column="Name") + _check_plot_works(plotting.radviz, frame=iris, class_column="Name") @pytest.mark.parametrize( "color", [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], ) def test_radviz_color(self, iris, color): - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", color=color) + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", color=color + ) # skip Circle drawn as ticks patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches[:10], facecolors=color, mapping=df["Name"][:10]) def test_radviz_color_cmap(self, iris): - from matplotlib import cm - - from pandas.plotting import radviz - df = iris - ax = _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + ax = _check_plot_works( + plotting.radviz, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] patches = [p for p in ax.patches[:20] if p.get_label() != ""] _check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10]) def test_radviz_colors_handles(self): - from pandas.plotting import radviz - colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 
0.0, 1.0]] df = DataFrame( {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]} ) - ax = radviz(df, "Name", color=colors) + ax = plotting.radviz(df, "Name", color=colors) handles, _ = ax.get_legend_handles_labels() _check_colors(handles, facecolors=colors) @@ -471,15 +444,11 @@ def test_get_standard_colors_random_seed(self): def test_get_standard_colors_consistency(self): # GH17525 # Make sure it produces the same colors every time it's called - from pandas.plotting._matplotlib.style import get_standard_colors - color1 = get_standard_colors(1, color_type="random") color2 = get_standard_colors(1, color_type="random") assert color1 == color2 def test_get_standard_colors_default_num_colors(self): - from pandas.plotting._matplotlib.style import get_standard_colors - # Make sure the default color_types returns the specified amount color1 = get_standard_colors(1, color_type="default") color2 = get_standard_colors(9, color_type="default") @@ -509,11 +478,7 @@ def test_get_standard_colors_no_appending(self): # Make sure not to add more colors so that matplotlib can cycle # correctly. - from matplotlib import cm - - from pandas.plotting._matplotlib.style import get_standard_colors - - color_before = cm.gnuplot(range(5)) + color_before = mpl.cm.gnuplot(range(5)) color_after = get_standard_colors(1, color=color_before) assert len(color_after) == len(color_before) @@ -521,7 +486,7 @@ def test_get_standard_colors_no_appending(self): np.random.default_rng(2).standard_normal((48, 4)), columns=list("ABCD") ) - color_list = cm.gnuplot(np.linspace(0, 1, 16)) + color_list = mpl.cm.gnuplot(np.linspace(0, 1, 16)) p = df.A.plot.bar(figsize=(16, 7), color=color_list) assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() @@ -546,9 +511,7 @@ def test_dictionary_color(self, kind): def test_bar_plot(self): # GH38947 # Test bar plot with string and int index - from matplotlib.text import Text - - expected = [Text(0, 0, "0"), Text(1, 0, "Total")] + expected = [mpl.text.Text(0, 0, "0"), mpl.text.Text(1, 0, "Total")] df = DataFrame( { @@ -565,11 +528,12 @@ def test_bar_plot(self): def test_barh_plot_labels_mixed_integer_string(self): # GH39126 # Test barh plot with string and integer at the same column - from matplotlib.text import Text - df = DataFrame([{"word": 1, "value": 0}, {"word": "knowledge", "value": 2}]) plot_barh = df.plot.barh(x="word", legend=None) - expected_yticklabels = [Text(0, 0, "1"), Text(0, 1, "knowledge")] + expected_yticklabels = [ + mpl.text.Text(0, 0, "1"), + mpl.text.Text(0, 1, "knowledge"), + ] assert all( actual.get_text() == expected.get_text() for actual, expected in zip( @@ -649,8 +613,8 @@ def test_externally_shared_axes(self): # Create data df = DataFrame( { - "a": np.random.default_rng(2).standard_normal(1000), - "b": np.random.default_rng(2).standard_normal(1000), + "a": np.random.default_rng(2).standard_normal(10), + "b": np.random.default_rng(2).standard_normal(10), } ) @@ -707,9 +671,7 @@ def test_plot_bar_axis_units_timestamp_conversion(self): def test_bar_plt_xaxis_intervalrange(self): # GH 38969 # Ensure IntervalIndex x-axis produces a bar plot as expected - from matplotlib.text import Text - - expected = [Text(0, 0, "([0, 1],)"), Text(1, 0, "([1, 2],)")] + expected = [mpl.text.Text(0, 0, "([0, 1],)"), mpl.text.Text(1, 0, "([1, 2],)")] s = Series( [1, 2], index=[interval_range(0, 2, closed="both")], diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 54f09c7007330..52ca66c218862 100644 --- 
a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -33,9 +33,14 @@ get_y_axis, ) +from pandas.tseries.offsets import CustomBusinessDay + mpl = pytest.importorskip("matplotlib") plt = pytest.importorskip("matplotlib.pyplot") +from pandas.plotting._matplotlib.converter import DatetimeConverter +from pandas.plotting._matplotlib.style import get_standard_colors + @pytest.fixture def ts(): @@ -49,7 +54,7 @@ def ts(): @pytest.fixture def series(): return Series( - range(20), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(20)] + range(10), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(10)] ) @@ -192,28 +197,24 @@ def test_area_sharey_dont_overwrite(self, ts): assert get_y_axis(ax1).joined(ax1, ax2) assert get_y_axis(ax2).joined(ax1, ax2) - plt.close(fig) def test_label(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(label="LABEL", legend=True, ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_none(self): s = Series([1, 2]) _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) _check_legend_labels(ax, labels=[""]) - mpl.pyplot.close("all") def test_label_ser_name(self): s = Series([1, 2], name="NAME") _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, ax=ax) _check_legend_labels(ax, labels=["NAME"]) - mpl.pyplot.close("all") def test_label_ser_name_override(self): s = Series([1, 2], name="NAME") @@ -221,7 +222,6 @@ def test_label_ser_name_override(self): _, ax = mpl.pyplot.subplots() ax = s.plot(legend=True, label="LABEL", ax=ax) _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_label_ser_name_override_dont_draw(self): s = Series([1, 2], name="NAME") @@ -231,7 +231,6 @@ def test_label_ser_name_override_dont_draw(self): assert ax.get_legend() is None # Hasn't been drawn ax.legend() # draw it _check_legend_labels(ax, labels=["LABEL"]) - mpl.pyplot.close("all") def test_boolean(self): # GH 23719 @@ -344,9 +343,7 @@ def test_rotation_30(self): _check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): - from pandas.plotting._matplotlib.converter import DatetimeConverter - - rng = date_range("1/1/2000", "3/1/2000") + rng = date_range("1/1/2000", "1/15/2000") rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() @@ -380,6 +377,12 @@ def test_pie_series(self): _check_text_labels(ax.texts, series.index) assert ax.get_ylabel() == "" + def test_pie_arrow_type(self): + # GH 59192 + pytest.importorskip("pyarrow") + ser = Series([1, 2, 3, 4], dtype="int32[pyarrow]") + _check_plot_works(ser.plot.pie) + def test_pie_series_no_label(self): series = Series( np.random.default_rng(2).integers(1, 5), @@ -453,9 +456,9 @@ def test_pie_nan(self): def test_df_series_secondary_legend(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # primary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() @@ -467,28 +470,12 @@ def test_df_series_secondary_legend(self): assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_with_axes(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), 
columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # primary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are drawn on left ax - # left and right axis must be visible - _check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) - assert ax.get_yaxis().get_visible() - assert ax.right_ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (without passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, ax=ax) @@ -500,29 +487,12 @@ def test_df_series_secondary_legend_both(self): assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis(self): - # GH 9779 - df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") - ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") - # secondary -> secondary (with passing ax) - _, ax = mpl.pyplot.subplots() - ax = df.plot(secondary_y=True, ax=ax) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are drawn on left ax - # left axis must be invisible and right axis must be visible - expected = ["a (right)", "b (right)", "c (right)", "x (right)"] - _check_legend_labels(ax.left_ax, expected) - assert not ax.left_ax.get_yaxis().get_visible() - assert ax.get_yaxis().get_visible() - def test_df_series_secondary_legend_both_with_axis_2(self): # GH 9779 df = DataFrame( - np.random.default_rng(2).standard_normal((30, 3)), columns=list("abc") + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") ) - s = Series(np.random.default_rng(2).standard_normal(30), name="x") + s = Series(np.random.default_rng(2).standard_normal(10), name="x") # secondary -> secondary (with passing ax) _, ax = mpl.pyplot.subplots() ax = df.plot(secondary_y=True, mark_right=False, ax=ax) @@ -537,17 +507,12 @@ def test_df_series_secondary_legend_both_with_axis_2(self): @pytest.mark.parametrize( "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] ) - def test_secondary_logy(self, input_logy, expected_scale): - # GH 25545 - s1 = Series(np.random.default_rng(2).standard_normal(100)) - s2 = Series(np.random.default_rng(2).standard_normal(100)) - - # GH 24980 - ax1 = s1.plot(logy=input_logy) - ax2 = s2.plot(secondary_y=True, logy=input_logy) - + @pytest.mark.parametrize("secondary_kwarg", [{}, {"secondary_y": True}]) + def test_secondary_logy(self, input_logy, expected_scale, secondary_kwarg): + # GH 25545, GH 24980 + s1 = Series(np.random.default_rng(2).standard_normal(10)) + ax1 = s1.plot(logy=input_logy, **secondary_kwarg) assert ax1.get_yscale() == expected_scale - assert ax2.get_yscale() == expected_scale def test_plot_fails_with_dupe_color_and_style(self): x = Series(np.random.default_rng(2).standard_normal(2)) @@ -573,6 +538,22 @@ def test_kde_kwargs(self, ts, bw_method, ind): pytest.importorskip("scipy") _check_plot_works(ts.plot.kde, bw_method=bw_method, ind=ind) + @pytest.mark.parametrize( + "bw_method, ind, weights", + [ + ["scott", 20, None], + [None, 20, None], + [None, np.int_(20), None], + [0.5, 
np.linspace(-100, 100, 20), None], + ["scott", 40, np.linspace(0.0, 2.0, 50)], + ], + ) + def test_kde_kwargs_weights(self, bw_method, ind, weights): + # GH59337 + pytest.importorskip("scipy") + s = Series(np.random.default_rng(2).uniform(size=50)) + _check_plot_works(s.plot.kde, bw_method=bw_method, ind=ind, weights=weights) + def test_density_kwargs(self, ts): pytest.importorskip("scipy") sample_points = np.linspace(-100, 100, 20) @@ -673,6 +654,9 @@ def test_errorbar_asymmetrical(self): expected = (err.T * np.array([-1, 1])) + s.to_numpy().reshape(-1, 1) tm.assert_numpy_array_equal(result, expected) + def test_errorbar_asymmetrical_error(self): + # GH9536 + s = Series(np.arange(10), name="x") msg = ( "Asymmetrical error bars should be provided " f"with the shape \\(2, {len(s)}\\)" @@ -759,8 +743,6 @@ def test_series_grid_settings(self): @pytest.mark.parametrize("c", ["r", "red", "green", "#FF0000"]) def test_standard_colors(self, c): - from pandas.plotting._matplotlib.style import get_standard_colors - result = get_standard_colors(1, color=c) assert result == [c] @@ -774,12 +756,8 @@ def test_standard_colors(self, c): assert result == [c] * 3 def test_standard_colors_all(self): - from matplotlib import colors - - from pandas.plotting._matplotlib.style import get_standard_colors - # multiple colors like mediumaquamarine - for c in colors.cnames: + for c in mpl.colors.cnames: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -793,7 +771,7 @@ def test_standard_colors_all(self): assert result == [c] * 3 # single letter colors like k - for c in colors.ColorConverter.colors: + for c in mpl.colors.ColorConverter.colors: result = get_standard_colors(num_colors=1, color=c) assert result == [c] @@ -821,8 +799,6 @@ def test_time_series_plot_color_kwargs(self): _check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_with_empty_kwargs(self): - import matplotlib as mpl - def_colors = _unpack_cycler(mpl.rcParams) index = date_range("1/1/2000", periods=12) s = Series(np.arange(1, 13), index=index) @@ -851,8 +827,6 @@ def test_xtick_barPlot(self): def test_custom_business_day_freq(self): # GH7222 - from pandas.tseries.offsets import CustomBusinessDay - s = Series( range(100, 121), index=pd.bdate_range( diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py index 665bda15724fd..f9c89e0a7893f 100644 --- a/pandas/tests/plotting/test_style.py +++ b/pandas/tests/plotting/test_style.py @@ -2,7 +2,8 @@ from pandas import Series -pytest.importorskip("matplotlib") +mpl = pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") from pandas.plotting._matplotlib.style import get_standard_colors @@ -18,11 +19,8 @@ class TestGetStandardColors: ], ) def test_default_colors_named_from_prop_cycle(self, num_colors, expected): - import matplotlib as mpl - from matplotlib.pyplot import cycler - mpl_params = { - "axes.prop_cycle": cycler(color=["red", "green", "blue"]), + "axes.prop_cycle": plt.cycler(color=["red", "green", "blue"]), } with mpl.rc_context(rc=mpl_params): result = get_standard_colors(num_colors=num_colors) @@ -39,11 +37,8 @@ def test_default_colors_named_from_prop_cycle(self, num_colors, expected): ], ) def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected): - import matplotlib as mpl - from matplotlib.pyplot import cycler - mpl_params = { - "axes.prop_cycle": cycler(color="bgry"), + "axes.prop_cycle": plt.cycler(color="bgry"), } with mpl.rc_context(rc=mpl_params): result = 
get_standard_colors(num_colors=num_colors) @@ -74,11 +69,8 @@ def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected) ], ) def test_default_colors_named_undefined_prop_cycle(self, num_colors, expected_name): - import matplotlib as mpl - import matplotlib.colors as mcolors - with mpl.rc_context(rc={}): - expected = [mcolors.to_hex(x) for x in expected_name] + expected = [mpl.colors.to_hex(x) for x in expected_name] result = get_standard_colors(num_colors=num_colors) assert result == expected diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 422ed8d4f3d2b..63e9e89cabd58 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -1387,6 +1389,7 @@ def test_mode_numerical_nan(self, dropna, expected): expected = Series(expected) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dropna, expected1, expected2, expected3", [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], @@ -1414,6 +1417,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): expected3 = Series(expected3) tm.assert_series_equal(result, expected3) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dropna, expected1, expected2", [(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])], @@ -1551,6 +1555,7 @@ def test_mode_intoverflow(self, dropna, expected1, expected2): expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted @@ -1568,7 +1573,7 @@ def test_mode_boolean_with_na(self): # GH#42107 ser = Series([True, False, True, pd.NA], dtype="boolean") result = ser.mode() - expected = Series({0: True}, dtype="boolean") + expected = Series([True], dtype="boolean") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 3428abacd509e..b2d9f6c0e3eb0 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -436,7 +436,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): empty_series_dti = Series([], index, dtype) with tm.assert_produces_warning(warn, match=msg): - rs = empty_series_dti.resample("d", group_keys=False) + rs = empty_series_dti.resample("D", group_keys=False) try: getattr(rs, resample_method)() except DataError: @@ -557,7 +557,8 @@ def test_first_last_skipna(any_real_nullable_dtype, skipna, how): method = getattr(rs, how) result = method(skipna=skipna) - gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + ts = pd.to_datetime("2020-01-31").as_unit("ns") + gb = df.groupby(df.shape[0] * [ts]) expected = getattr(gb, how)(skipna=skipna) expected.index.freq = "ME" tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index c38d223c9d6a0..dc2ddcc70828f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1,9 +1,9 @@ from datetime import datetime from functools 
import partial +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs import lib from pandas._typing import DatetimeNaTType @@ -239,7 +239,9 @@ def _ohlc(group): def test_resample_how_callables(unit): # GH#7929 data = np.arange(5, dtype=np.int64) - ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit) + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit) df = DataFrame({"A": data, "B": data}, index=ind) def fn(x, a=1): @@ -334,7 +336,9 @@ def test_resample_basic_from_daily(unit): s = Series(np.random.default_rng(2).random(len(dti)), dti) # to weekly - result = s.resample("w-sun").last() + msg = "'w-sun' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample("w-sun").last() assert len(result) == 3 assert (result.index.dayofweek == [6, 6, 6]).all() @@ -1190,7 +1194,9 @@ def test_anchored_lowercase_buglet(unit): dates = date_range("4/16/2012 20:00", periods=50000, freq="s").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates) # it works! - ts.resample("d").mean() + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + ts.resample("d").mean() def test_upsample_apply_functions(unit): @@ -1531,9 +1537,9 @@ def test_groupby_with_dst_time_change(unit): ) df = DataFrame([1, 2], index=index) - result = df.groupby(Grouper(freq="1d")).last() + result = df.groupby(Grouper(freq="1D")).last() expected_index_values = date_range( - "2016-11-02", "2016-11-24", freq="d", tz="America/Chicago" + "2016-11-02", "2016-11-24", freq="D", tz="America/Chicago" ).as_unit(unit) index = DatetimeIndex(expected_index_values) @@ -1655,13 +1661,13 @@ def test_resample_dst_anchor2(unit): def test_downsample_across_dst(unit): # GH 8531 - tz = pytz.timezone("Europe/Berlin") + tz = zoneinfo.ZoneInfo("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq="2h").as_unit(unit) + dates = date_range(dt.astimezone(tz), periods=4, freq="2h").as_unit(unit) result = Series(5, index=dates).resample("h").mean() expected = Series( [5.0, np.nan] * 3 + [5.0], - index=date_range(tz.localize(dt), periods=7, freq="h").as_unit(unit), + index=date_range(dt.astimezone(tz), periods=7, freq="h").as_unit(unit), ) tm.assert_series_equal(result, expected) @@ -2014,45 +2020,42 @@ def test_resample_empty_series_with_tz(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "freq, freq_depr", - [ - ("2ME", "2M"), - ("2QE", "2Q"), - ("2QE-SEP", "2Q-SEP"), - ("1YE", "1Y"), - ("2YE-MAR", "2Y-MAR"), - ], -) -def test_resample_M_Q_Y_deprecated(freq, freq_depr): - # GH#9586 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." 
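The hunks above keep deprecated lowercase aliases such as "d" and "w-sun" working while pinning the FutureWarning they now emit. A rough sketch of that contract (dates and values illustrative, assuming a pandas version where the lowercase spellings still warn rather than raise):

    import numpy as np
    import pandas as pd
    import pandas._testing as tm

    ser = pd.Series(np.arange(48), index=pd.date_range("2014-01-01", periods=48, freq="h"))

    expected = ser.resample("D").mean()  # canonical alias: no warning

    # deprecated lowercase alias: same result, but it must warn
    with tm.assert_produces_warning(FutureWarning, match="'d' is deprecated"):
        result = ser.resample("d").mean()

    tm.assert_series_equal(result, expected)
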
- - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample(freq).mean() - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = s.resample(freq_depr).mean() - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("freq", ["2M", "2m", "2Q", "2Q-SEP", "2q-sep", "1Y", "2Y-MAR"]) +def test_resample_M_Q_Y_raises(freq): + msg = f"Invalid frequency: {freq}" + + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() + + +@pytest.mark.parametrize("freq", ["2BM", "1bm", "1BQ", "2BQ-MAR", "2bq=-mar"]) +def test_resample_BM_BQ_raises(freq): + msg = f"Invalid frequency: {freq}" + + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) + with pytest.raises(ValueError, match=msg): + s.resample(freq).mean() @pytest.mark.parametrize( - "freq, freq_depr", + "freq,freq_depr,data", [ - ("2BME", "2BM"), - ("2BQE", "2BQ"), - ("2BQE-MAR", "2BQ-MAR"), + ("1W-SUN", "1w-sun", ["2013-01-06"]), + ("1D", "1d", ["2013-01-01"]), + ("1B", "1b", ["2013-01-01"]), + ("1C", "1c", ["2013-01-01"]), ], ) -def test_resample_BM_BQ_deprecated(freq, freq_depr): - # GH#52064 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " - f"in a future version, please use '{freq[1:]}' instead." - - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample(freq).mean() - with tm.assert_produces_warning(FutureWarning, match=depr_msg): +def test_resample_depr_lowercase_frequency(freq, freq_depr, data): + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." + + s = Series(range(5), index=date_range("20130101", freq="h", periods=5)) + with tm.assert_produces_warning(FutureWarning, match=msg): result = s.resample(freq_depr).mean() + + exp_dti = DatetimeIndex(data=data, dtype="datetime64[ns]", freq=freq) + expected = Series(2.0, index=exp_dti) tm.assert_series_equal(result, expected) @@ -2153,6 +2156,6 @@ def test_arrow_timestamp_resample(tz): def test_resample_A_raises(freq): msg = f"Invalid frequency: {freq[1:]}" - s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + s = Series(range(10), index=date_range("20130101", freq="D", periods=10)) with pytest.raises(ValueError, match=msg): s.resample(freq).mean() diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 67db427a2cdb7..e17529dfab00c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1,11 +1,14 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import re import warnings +import zoneinfo import dateutil import numpy as np import pytest -import pytz from pandas._libs.tslibs.ccalendar import ( DAYS, @@ -161,12 +164,12 @@ def test_basic_downsample(self, simple_period_range_series): ("Y-DEC", ""), ("Q-MAR", ""), ("M", ""), - ("w-thu", ""), + ("W-THU", ""), ], ) def test_not_subperiod(self, simple_period_range_series, rule, expected_error_msg): # These are incompatible period rules for resampling - ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="W-WED") msg = ( "Frequency cannot be resampled to " f"{expected_error_msg}, as they are not sub or super periods" @@ -304,7 +307,7 @@ def test_resample_incompat_freq(self): @pytest.mark.parametrize( "tz", [ - 
pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), dateutil.tz.gettz("America/Los_Angeles"), ], ) @@ -312,9 +315,13 @@ def test_with_local_timezone(self, tz): # see gh-5430 local_timezone = tz - start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) + start = datetime( + year=2013, month=11, day=1, hour=0, minute=0, tzinfo=timezone.utc + ) # 1 day later - end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) + end = datetime( + year=2013, month=11, day=2, hour=0, minute=0, tzinfo=timezone.utc + ) index = date_range(start, end, freq="h", name="idx") @@ -336,7 +343,7 @@ def test_with_local_timezone(self, tz): @pytest.mark.parametrize( "tz", [ - pytz.timezone("America/Los_Angeles"), + zoneinfo.ZoneInfo("America/Los_Angeles"), dateutil.tz.gettz("America/Los_Angeles"), ], ) @@ -353,8 +360,6 @@ def test_resample_with_tz(self, tz, unit): index=exp_dti, ) tm.assert_series_equal(result, expected) - # Especially assert that the timezone is LMT for pytz - assert result.index.tz == tz def test_resample_nonexistent_time_bin_edge(self): # GH 19375 @@ -988,30 +993,22 @@ def test_resample_t_l_deprecated(self): ser.resample("T").mean() @pytest.mark.parametrize( - "freq, freq_depr, freq_res, freq_depr_res, data", + "freq, freq_depr, freq_depr_res", [ - ("2Q", "2q", "2Y", "2y", [0.5]), - ("2M", "2m", "2Q", "2q", [1.0, 3.0]), + ("2Q", "2q", "2y"), + ("2M", "2m", "2q"), ], ) - def test_resample_lowercase_frequency_deprecated( - self, freq, freq_depr, freq_res, freq_depr_res, data - ): - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq[1:]}' instead." - depr_msg_res = f"'{freq_depr_res[1:]}' is deprecated and will be removed in a " - f"future version. Please use '{freq_res[1:]}' instead." 
- - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - rng_l = period_range("2020-01-01", "2020-08-01", freq=freq_depr) - ser = Series(np.arange(len(rng_l)), index=rng_l) - - rng = period_range("2020-01-01", "2020-08-01", freq=freq_res) - expected = Series(data=data, index=rng) + def test_resample_lowercase_frequency_raises(self, freq, freq_depr, freq_depr_res): + msg = f"Invalid frequency: {freq_depr}" + with pytest.raises(ValueError, match=msg): + period_range("2020-01-01", "2020-08-01", freq=freq_depr) - with tm.assert_produces_warning(FutureWarning, match=depr_msg_res): - result = ser.resample(freq_depr_res).mean() - tm.assert_series_equal(result, expected) + msg = f"Invalid frequency: {freq_depr_res}" + rng = period_range("2020-01-01", "2020-08-01", freq=freq) + ser = Series(np.arange(len(rng)), index=rng) + with pytest.raises(ValueError, match=msg): + ser.resample(freq_depr_res).mean() @pytest.mark.parametrize( "offset", @@ -1031,25 +1028,26 @@ def test_asfreq_invalid_period_offset(self, offset, frame_or_series): @pytest.mark.parametrize( - "freq,freq_depr", + "freq", [ - ("2M", "2ME"), - ("2Q", "2QE"), - ("2Q-FEB", "2QE-FEB"), - ("2Y", "2YE"), - ("2Y-MAR", "2YE-MAR"), - ("2M", "2me"), - ("2Q", "2qe"), - ("2Y-MAR", "2ye-mar"), + ("2ME"), + ("2QE"), + ("2QE-FEB"), + ("2YE"), + ("2YE-MAR"), + ("2me"), + ("2qe"), + ("2ye-mar"), ], ) -def test_resample_frequency_ME_QE_YE_error_message(frame_or_series, freq, freq_depr): +def test_resample_frequency_ME_QE_YE_raises(frame_or_series, freq): # GH#9586 - msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + msg = f"{freq[1:]} is not supported as period frequency" obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) + msg = f"Invalid frequency: {freq}" with pytest.raises(ValueError, match=msg): - obj.resample(freq_depr) + obj.resample(freq) def test_corner_cases_period(simple_period_range_series): @@ -1062,20 +1060,11 @@ def test_corner_cases_period(simple_period_range_series): assert len(result) == 0 -@pytest.mark.parametrize( - "freq_depr", - [ - "2BME", - "2CBME", - "2SME", - "2BQE-FEB", - "2BYE-MAR", - ], -) -def test_resample_frequency_invalid_freq(frame_or_series, freq_depr): +@pytest.mark.parametrize("freq", ["2BME", "2CBME", "2SME", "2BQE-FEB", "2BYE-MAR"]) +def test_resample_frequency_invalid_freq(frame_or_series, freq): # GH#9586 - msg = f"Invalid frequency: {freq_depr[1:]}" + msg = f"Invalid frequency: {freq}" obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5)) with pytest.raises(ValueError, match=msg): - obj.resample(freq_depr) + obj.resample(freq) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index bf1f6bd34b171..a8fb1b392322d 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -732,7 +732,7 @@ def test_agg_with_datetime_index_list_agg_func(col_name): ), columns=[col_name], ) - result = df.resample("1d").aggregate(["mean"]) + result = df.resample("1D").aggregate(["mean"]) expected = DataFrame( [47.5, 143.5, 195.5], index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"), diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 520ef40153ecd..ff1b82210e20d 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import 
using_string_dtype + from pandas.compat import is_platform_windows import pandas as pd @@ -491,6 +493,7 @@ def test_empty(keys): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 5f5a54c4d92a3..f694b90a707c7 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -193,7 +193,7 @@ def test_aggregate_nth(): ) def test_resample_entirely_nat_window(method, method_args, unit): ser = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4)) - result = methodcaller(method, **method_args)(ser.resample("2d")) + result = methodcaller(method, **method_args)(ser.resample("2D")) exp_dti = pd.DatetimeIndex(["2017-01-01", "2017-01-03"], dtype="M8[ns]", freq="2D") expected = Series([0.0, unit], index=exp_dti) @@ -372,7 +372,7 @@ def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): for df in dfs: result = df.groupby("volume").apply( - lambda x: x.resample("1d").interpolate(method="linear"), + lambda x: x.resample("1D").interpolate(method="linear"), include_groups=False, ) @@ -421,11 +421,13 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) ) volume = [50, 50, 60] - week_starting = [ - Timestamp("2018-01-07"), - Timestamp("2018-01-18 01:00:00"), - Timestamp("2018-01-14"), - ] + week_starting = pd.DatetimeIndex( + [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), + ] + ).as_unit("ns") expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], names=["volume", "week_starting"], diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index c831cb8293943..d0ff950e7985f 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -1,3 +1,5 @@ +import zoneinfo + import numpy as np import pytest @@ -19,12 +21,12 @@ "float64": [1.1, np.nan, 3.3], "category": Categorical(["X", "Y", "Z"]), "object": ["a", "b", "c"], - "datetime64[ns]": [ + "datetime64[s]": [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-03"), ], - "datetime64[ns, US/Eastern]": [ + "datetime64[s, US/Eastern]": [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), pd.Timestamp("2011-01-03", tz="US/Eastern"), @@ -353,14 +355,15 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): tm.assert_series_equal(res, Series(exp, index=[0, 1, 0, 1])) # different tz - dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + tz_diff = zoneinfo.ZoneInfo("US/Hawaii") + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz_diff) exp = Index( [ pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2011-01-02", tz=tz), - pd.Timestamp("2012-01-01", tz="US/Pacific"), - pd.Timestamp("2012-01-02", tz="US/Pacific"), + pd.Timestamp("2012-01-01", tz=tz_diff), + pd.Timestamp("2012-01-02", tz=tz_diff), ], dtype=object, ) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index f86cc0c69d363..b2caa1fadd1a5 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ 
-5,10 +5,13 @@ from collections.abc import Iterator from datetime import datetime from decimal import Decimal +import itertools import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import InvalidIndexError import pandas as pd @@ -44,6 +47,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) @@ -51,35 +55,39 @@ def test_concat_copy(self): # These are actual copies. result = concat([df, df2, df3], axis=1) - for arr in result._mgr.arrays: - assert arr.base is not None + for block in result._mgr.blocks: + assert block.values.base is not None # These are the same. result = concat([df, df2, df3], axis=1) - for arr in result._mgr.arrays: + for block in result._mgr.blocks: + arr = block.values if arr.dtype.kind == "f": - assert arr.base is df._mgr.arrays[0].base + assert arr.base is df._mgr.blocks[0].values.base elif arr.dtype.kind in ["i", "u"]: - assert arr.base is df2._mgr.arrays[0].base + assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: assert arr.base is not None # Float block was consolidated. df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) result = concat([df, df2, df3, df4], axis=1) - for arr in result._mgr.arrays: + for blocks in result._mgr.blocks: + arr = blocks.values if arr.dtype.kind == "f": # this is a view on some array in either df or df4 assert any( - np.shares_memory(arr, other) - for other in df._mgr.arrays + df4._mgr.arrays + np.shares_memory(arr, block.values) + for block in itertools.chain(df._mgr.blocks, df4._mgr.blocks) ) elif arr.dtype.kind in ["i", "u"]: - assert arr.base is df2._mgr.arrays[0].base + assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: # this is a view on df3 - assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays) + assert any( + np.shares_memory(arr, block.values) for block in df3._mgr.blocks + ) def test_concat_with_group_keys(self): # axis=0 diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 3e046b2df72d8..0cf3192ea3a74 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -213,7 +213,7 @@ def test_concat_NaT_dataframes(self, tz): @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) - @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")]) + @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101").as_unit("ns")]) def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item): # GH 12396 @@ -358,7 +358,7 @@ def test_concat_tz_series_tzlocal(self): result = concat([Series(x), Series(y)], ignore_index=True) tm.assert_series_equal(result, Series(x + y)) - assert result.dtype == "datetime64[ns, tzlocal()]" + assert result.dtype == "datetime64[s, tzlocal()]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta @@ -539,8 +539,8 @@ def test_concat_timedelta64_block(): df = DataFrame({"time": rng}) result = concat([df, df]) - tm.assert_frame_equal(result.iloc[:10], df) - tm.assert_frame_equal(result.iloc[10:], df) + tm.assert_frame_equal(result.iloc[:10], df, check_index_type=False) + 
tm.assert_frame_equal(result.iloc[10:], df, check_index_type=False) def test_concat_multiindex_datetime_nat(): diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 68d77b79a59e7..e13b042192fc6 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -346,9 +346,11 @@ def test_concat_with_key_not_unique(self, performance_warning): performance_warning, match="indexing past lexsort depth" ): out_a = df_a.loc[("x", 0), :] df_b = DataFrame( - {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)]) + {"name": [1, 2, 3]}, + index=MultiIndex( + levels=[["x", "y"], range(1)], codes=[[0, 1, 0], [0, 0, 0]] + ), ) with tm.assert_produces_warning( performance_warning, match="indexing past lexsort depth" ) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5c5c06dea0008..4a6228e47eba0 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1367,8 +1367,8 @@ def test_merge_two_empty_df_no_division_error(self): ), ), ( - TimedeltaIndex(["1d", "2d", "3d"]), - TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]), + TimedeltaIndex(["1D", "2D", "3D"]), + TimedeltaIndex(["1D", "2D", "3D", pd.NaT, pd.NaT, pd.NaT]), ), ], ) @@ -1451,8 +1451,8 @@ def test_merge_readonly(self): ) # make each underlying block array / column array read-only - for arr in data1._mgr.arrays: - arr.flags.writeable = False + for block in data1._mgr.blocks: + block.values.flags.writeable = False data1.merge(data2) # no error @@ -2970,7 +2970,7 @@ def test_merge_empty_frames_column_order(left_empty, right_empty): df2 = df2.iloc[:0] result = merge(df1, df2, on=["A"], how="outer") - expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"]) + expected = DataFrame(1, index=range(1), columns=["A", "B", "C", "D"]) if left_empty and right_empty: expected = expected.iloc[:0] elif left_empty: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 4fc57c14ec4c3..62fd8c5a7e231 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -2,7 +2,8 @@ import numpy as np import pytest -import pytz + +from pandas._config import using_string_dtype import pandas.util._test_decorators as td @@ -2071,7 +2072,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-02"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value1": np.arange(5), @@ -2083,7 +2084,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-01"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value2": list("ABCDE"), @@ -2097,7 +2098,7 @@ def test_tolerance_tz(self, unit): start=to_datetime("2016-01-02"), freq="D", periods=5, - tz=pytz.timezone("UTC"), + tz=datetime.timezone.utc, unit=unit, ), "value1": np.arange(5), @@ -3063,6 +3064,7 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_merge_datatype_error_raises(self, using_infer_string): if using_infer_string: msg = "incompatible merge keys" diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 340c5c449aea7..d8bb4fba1e1fe 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -1,3 +1,5 @@ +from datetime import datetime + import numpy
as np import pytest @@ -445,10 +447,16 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CategoricalDtype(ordered=True)) + ) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) + + if type(bins[0]) is datetime: + # The bins have microsecond dtype -> so does result + expected = expected.astype("interval[datetime64[us]]") + + expected = expected.astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -461,10 +469,6 @@ def test_datetime_cut(unit, box): data = box(data) result, _ = cut(data, 3, retbins=True) - if box is list: - # We don't (yet) do inference on these, so get nanos - unit = "ns" - if unit == "s": # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425 # for why we round to 8 seconds instead of 7 @@ -531,24 +535,26 @@ def test_datetime_tz_cut(bins, box): bins = box(bins) result = cut(ser, bins) - expected = Series( - IntervalIndex( - [ - Interval( - Timestamp("2012-12-31 23:57:07.200000", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz), - ), - ] - ) - ).astype(CategoricalDtype(ordered=True)) + ii = IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + if isinstance(bins, int): + # the dtype is inferred from ser, which has nanosecond unit + ii = ii.astype("interval[datetime64[ns, US/Eastern]]") + expected = Series(ii).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index ba71bb24e8a16..bfb6a3c0167c8 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Series, @@ -362,6 +364,7 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 082d5f0ee81ab..304ba65f38d3c 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -214,6 +216,7 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dataframe_dummies_string_dtype(self, df, using_infer_string): # GH44965 df = df[["A", "B"]] diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 49200face66c5..be4f2ab4d183d 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,6 +3,8 @@ import numpy as np import pytest 
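The `using_string_dtype()` xfail markers that recur through these reshape hunks evaluate the flag once, at collection time, so one module serves both the legacy object dtype and the future string dtype. The shape of the marker, sketched against a hypothetical test body:

    import pytest
    from pandas._config import using_string_dtype

    # condition is evaluated when the module is collected; strict=False
    # (used on some parametrized cases) tolerates an unexpected pass
    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
    def test_needs_object_dtype():
        ...
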
+from pandas._config import using_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -81,6 +83,7 @@ def test_default_col_names(self, df): result2 = df.melt(id_vars=["id1", "id2"]) assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -97,6 +100,7 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -174,6 +178,7 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -201,6 +206,7 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -230,6 +236,7 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -354,6 +361,7 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): df.melt(["A"], ["F"], col_level=0) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -1214,6 +1222,7 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", ["O", "string"]) def test_missing_stubname(self, dtype): # GH46044 @@ -1239,6 +1248,7 @@ def test_missing_stubname(self, dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wide_to_long_pyarrow_string_columns(): # GH 57066 pytest.importorskip("pyarrow") diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 4a13c1f5e1167..44b96afaa4ef5 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype from pandas.compat.numpy import np_version_gte1p25 @@ -1068,6 +1068,7 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -2058,6 +2059,60 @@ def test_pivot_string_as_func(self): ).rename_axis("A") 
tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("kwargs", [{"a": 2}, {"a": 2, "b": 3}, {"b": 3, "a": 2}]) + def test_pivot_table_kwargs(self, kwargs): + # GH#57884 + def f(x, a, b=3): + return x.sum() * a + b + + def g(x): + return f(x, **kwargs) + + df = DataFrame( + { + "A": ["good", "bad", "good", "bad", "good"], + "B": ["one", "two", "one", "three", "two"], + "X": [2, 5, 4, 20, 10], + } + ) + result = pivot_table( + df, index="A", columns="B", values="X", aggfunc=f, **kwargs + ) + expected = pivot_table(df, index="A", columns="B", values="X", aggfunc=g) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs", [{}, {"b": 10}, {"a": 3}, {"a": 3, "b": 10}, {"b": 10, "a": 3}] + ) + def test_pivot_table_kwargs_margin(self, data, kwargs): + # GH#57884 + def f(x, a=5, b=7): + return (x.sum() + b) * a + + def g(x): + return f(x, **kwargs) + + result = data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + aggfunc=f, + margins=True, + fill_value=0, + **kwargs, + ) + + expected = data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + aggfunc=g, + margins=True, + fill_value=0, + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "f, f_numpy", [ @@ -2515,6 +2570,7 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [object, "string"]) def test_pivot_integer_bug(self, dtype): df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) @@ -2602,7 +2658,7 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2618,7 +2674,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2632,7 +2688,7 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2715,3 +2771,17 @@ def test_unstack_copy(self, m): result = df.unstack(sort=False) result.iloc[0, 0] = -1 tm.assert_frame_equal(df, df_orig) + + def test_pivot_empty_with_datetime(self): + # GH#59126 + df = DataFrame( + { + "timestamp": Series([], dtype=pd.DatetimeTZDtype(tz="UTC")), + "category": Series([], dtype=str), + "value": Series([], dtype=str), + } + ) + df_pivoted = df.pivot_table( + index="category", columns="value", values="timestamp" + ) + assert df_pivoted.empty diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 53af673e0f7b0..5f769db7f8acf 100644 --- a/pandas/tests/reshape/test_qcut.py +++ 
b/pandas/tests/reshape/test_qcut.py @@ -271,8 +271,10 @@ def test_datetime_tz_qcut(bins): ], ], ) -def test_date_like_qcut_bins(arg, expected_bins): +def test_date_like_qcut_bins(arg, expected_bins, unit): # see gh-19891 + arg = arg.as_unit(unit) + expected_bins = expected_bins.as_unit(unit) ser = Series(arg) result, result_bins = qcut(ser, 2, retbins=True) tm.assert_index_equal(result_bins, expected_bins) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 8d78d34e936f0..1d5d16f39e648 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.core.dtypes.concat import union_categoricals import pandas as pd @@ -122,6 +124,7 @@ def test_union_categoricals_nan(self): exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [[], ["1"]]) def test_union_categoricals_empty(self, val, request, using_infer_string): # GH 13759 diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 1a21d234f1d50..0ae5389a3e9b5 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -59,6 +59,7 @@ def test_asfreq_corner(self): def test_conv_annual(self): # frequency conversion tests: from Annual Frequency + msg = INVALID_FREQ_ERR_MSG ival_A = Period(freq="Y", year=2007) @@ -110,18 +111,14 @@ def test_conv_annual(self): assert ival_A.asfreq("B", "E") == ival_A_to_B_end assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - msg = "'H' is deprecated and will be removed in a future version." - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("H", "s") == ival_A_to_H_start assert ival_A.asfreq("H", "E") == ival_A_to_H_end assert ival_A.asfreq("min", "s") == ival_A_to_T_start assert ival_A.asfreq("min", "E") == ival_A_to_T_end - msg = "Invalid frequency: T" with pytest.raises(ValueError, match=msg): assert ival_A.asfreq("T", "s") == ival_A_to_T_start assert ival_A.asfreq("T", "E") == ival_A_to_T_end - msg = "'S' is deprecated and will be removed in a future version." 
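The edit in progress here flips the removed "H"/"T"/"S" aliases from deprecation warnings to the generic invalid-frequency error, while the canonical lowercase spellings keep converting. Sketched outside the test, assuming a version where the removal has landed:

    import pytest
    from pandas import Period

    p = Period(freq="Y", year=2007)

    assert p.asfreq("min", "s").minute == 0  # lowercase alias still converts

    # the removed uppercase alias now falls through to INVALID_FREQ_ERR_MSG
    with pytest.raises(ValueError, match="Invalid frequency: H"):
        p.asfreq("H", "s")
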
- with tm.assert_produces_warning(FutureWarning, match=msg): assert ival_A.asfreq("S", "S") == ival_A_to_S_start assert ival_A.asfreq("S", "E") == ival_A_to_S_end @@ -820,7 +817,7 @@ def test_asfreq_MS(self): assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") - msg = "MS is not supported as period frequency" + msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 2c3a0816737fc..fe51817a78be8 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -60,7 +60,7 @@ def test_invalid_frequency_error_message(self): Period("2012-01-02", freq="WOM-1MON") def test_invalid_frequency_period_error_message(self): - msg = "for Period, please use 'M' instead of 'ME'" + msg = "Invalid frequency: ME" with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="ME") @@ -117,7 +117,9 @@ def test_construction(self): i2 = Period("3/1/2005", freq="D") assert i1 == i2 - i3 = Period(year=2005, month=3, day=1, freq="d") + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i3 = Period(year=2005, month=3, day=1, freq="d") assert i1 == i3 i1 = Period("2007-01-01 09:00:00.001") @@ -613,6 +615,25 @@ def test_period_large_ordinal(self, hour): p = Period(ordinal=2562048 + hour, freq="1h") assert p.hour == hour + @pytest.mark.filterwarnings( + "ignore:Period with BDay freq is deprecated:FutureWarning" + ) + @pytest.mark.parametrize( + "freq,freq_depr", + [("2W", "2w"), ("2W-FRI", "2w-fri"), ("2D", "2d"), ("2B", "2b")], + ) + def test_period_deprecated_lowercase_freq(self, freq, freq_depr): + # GH#58998 + msg = ( + f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Period("2016-03-01 09:00", freq=freq_depr) + + expected = Period("2016-03-01 09:00", freq=freq) + assert result == expected + class TestPeriodMethods: def test_round_trip(self): diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e352e2601cef3..b20df43dd49a6 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -3,10 +3,10 @@ timedelta, ) import operator +import zoneinfo import numpy as np import pytest -import pytz from pandas._libs.tslibs import iNaT from pandas.compat.numpy import np_version_gte1p24p3 @@ -361,7 +361,7 @@ def test_nat_doc_strings(compare): (Timestamp("2014-01-01"), "timestamp"), (Timestamp("2014-01-01", tz="UTC"), "timestamp"), (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), - (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), + (datetime(2014, 1, 1).astimezone(zoneinfo.ZoneInfo("Asia/Tokyo")), "timestamp"), ], ) def test_nat_arithmetic_scalar(op_name, value, val_type): @@ -439,8 +439,10 @@ def test_nat_rfloordiv_timedelta(val, expected): @pytest.mark.parametrize( "value", [ - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], dtype="M8[ns]", name="x"), + DatetimeIndex( + ["2011-01-01", "2011-01-02"], dtype="M8[ns, US/Eastern]", name="x" + ), DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], dtype="M8[ns]"), DatetimeArray._from_sequence( ["2011-01-01", "2011-01-02"], dtype=DatetimeTZDtype(tz="US/Pacific") diff --git a/pandas/tests/scalar/timedelta/methods/test_round.py b/pandas/tests/scalar/timedelta/methods/test_round.py index 082c36999e06f..96cb1c07d2b76 100644 --- a/pandas/tests/scalar/timedelta/methods/test_round.py +++ b/pandas/tests/scalar/timedelta/methods/test_round.py @@ -38,7 +38,7 @@ class TestTimedeltaRound: ("min", "1 days 02:35:00", "-1 days 02:35:00"), ("12min", "1 days 02:36:00", "-1 days 02:36:00"), ("h", "1 days 03:00:00", "-1 days 03:00:00"), - ("d", "1 days", "-1 days"), + ("D", "1 days", "-1 days"), ], ) def test_round(self, freq, s1, s2): diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index efeca375affbb..2183a5851ea9c 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -79,7 +79,7 @@ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_datetimelike_scalar(self, op): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, datetime(2016, 1, 1)) if op is operator.add: @@ -111,7 +111,7 @@ def test_td_add_timestamp_overflow(self): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, Timedelta(days=10)) assert isinstance(result, Timedelta) @@ -119,35 +119,35 @@ def test_td_add_td(self, op): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_pytimedelta(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, timedelta(days=9)) assert isinstance(result, Timedelta) assert result == Timedelta(days=19) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): - td = Timedelta(10, unit="d") + td = 
Timedelta(10, unit="D") result = op(td, np.timedelta64(-4, "D")) assert isinstance(result, Timedelta) assert result == Timedelta(days=6) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_offset(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, offsets.Hour(6)) assert isinstance(result, Timedelta) assert result == Timedelta(days=10, hours=6) def test_td_sub_td(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td assert isinstance(result, Timedelta) assert result == expected def test_td_sub_pytimedelta(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_pytimedelta() @@ -159,7 +159,7 @@ def test_td_sub_pytimedelta(self): assert result == expected def test_td_sub_timedelta64(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_timedelta64() @@ -172,12 +172,12 @@ def test_td_sub_timedelta64(self): def test_td_sub_nat(self): # In this context pd.NaT is treated as timedelta-like - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - NaT assert result is NaT def test_td_sub_td64_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") td_nat = np.timedelta64("NaT") result = td - td_nat @@ -187,13 +187,13 @@ def test_td_sub_td64_nat(self): assert result is NaT def test_td_sub_offset(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - offsets.Hour(1) assert isinstance(result, Timedelta) assert result == Timedelta(239, unit="h") def test_td_add_sub_numeric_raises(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") msg = "unsupported operand type" for other in [2, 2.0, np.int64(2), np.float64(2)]: with pytest.raises(TypeError, match=msg): @@ -234,7 +234,7 @@ def test_td_add_sub_int_ndarray(self): other - td def test_td_rsub_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT - td assert result is NaT @@ -242,7 +242,7 @@ def test_td_rsub_nat(self): assert result is NaT def test_td_rsub_offset(self): - result = offsets.Hour(1) - Timedelta(10, unit="d") + result = offsets.Hour(1) - Timedelta(10, unit="D") assert isinstance(result, Timedelta) assert result == Timedelta(-239, unit="h") @@ -362,7 +362,7 @@ class TestTimedeltaMultiplicationDivision: @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nat(self, op, td_nat): # GH#19819 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") typs = "|".join(["numpy.timedelta64", "NaTType", "Timedelta"]) msg = "|".join( [ @@ -377,7 +377,7 @@ def test_td_mul_nat(self, op, td_nat): @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nan(self, op, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, nan) assert result is NaT @@ -419,7 +419,7 @@ def test_td_mul_numeric_ndarray(self): def test_td_mul_numeric_ndarray_0d(self): td = Timedelta("1 day") - other = np.array(2) + other = np.array(2, dtype=np.int64) assert other.ndim == 0 expected = Timedelta("2 days") @@ -449,7 +449,7 @@ def test_td_mul_td64_ndarray_invalid(self): def test_td_div_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / offsets.Hour(1) assert result == 240 @@ -480,7 +480,7 @@ def 
test_td_div_td64_non_nano(self): def test_td_div_numeric_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / 2 assert isinstance(result, Timedelta) @@ -500,7 +500,7 @@ def test_td_div_numeric_scalar(self): ) def test_td_div_nan(self, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / nan assert result is NaT @@ -532,7 +532,7 @@ def test_td_div_ndarray_0d(self): def test_td_rdiv_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = offsets.Hour(1) / td assert result == 1 / 240.0 @@ -540,7 +540,7 @@ def test_td_rdiv_timedeltalike_scalar(self): def test_td_rdiv_na_scalar(self): # GH#31869 None gets cast to NaT - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT / td assert np.isnan(result) @@ -560,7 +560,7 @@ def test_td_rdiv_na_scalar(self): np.nan / td def test_td_rdiv_ndarray(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") arr = np.array([td], dtype=object) result = arr / td @@ -583,7 +583,7 @@ def test_td_rdiv_ndarray(self): arr / td def test_td_rdiv_ndarray_0d(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") arr = np.array(td.asm8) @@ -623,6 +623,7 @@ def test_td_floordiv_invalid_scalar(self): [ r"Invalid dtype datetime64\[D\] for __floordiv__", "'dtype' is an invalid keyword argument for this function", + "this function got an unexpected keyword argument 'dtype'", r"ufunc '?floor_divide'? cannot use operands with types", ] ) diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 5509216f4daf4..e029dfc3b2703 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -32,20 +32,31 @@ def test_unit_m_y_raises(self, unit): with pytest.raises(ValueError, match=msg): to_timedelta([1, 2], unit) - @pytest.mark.parametrize("unit", ["h", "s"]) - def test_units_H_S_deprecated(self, unit): + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): # GH#52536 - msg = f"'{unit.upper()}' is deprecated and will be removed in a future version." + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." 
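The parametrized test continuing below pins the one-spelling-per-unit rule for the `unit=` keyword: "W", "D", "min", "s", "h", "ms", "us" are canonical, and the old variants warn. In sketch form:

    import pandas as pd
    import pandas._testing as tm

    assert pd.Timedelta(10, unit="D") == pd.Timedelta(days=10)

    # deprecated spelling resolves to the same value but must warn
    with tm.assert_produces_warning(FutureWarning, match="'d' is deprecated"):
        assert pd.Timedelta(10, unit="d") == pd.Timedelta(days=10)
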
expected = Timedelta(1, unit=unit) with tm.assert_produces_warning(FutureWarning, match=msg): - result = Timedelta(1, unit=unit.upper()) + result = Timedelta(1, unit=unit_depr) tm.assert_equal(result, expected) @pytest.mark.parametrize( "unit, np_unit", - [(value, "W") for value in ["W", "w"]] - + [(value, "D") for value in ["D", "d", "days", "day", "Days", "Day"]] + [("W", "W")] + + [(value, "D") for value in ["D", "days", "day", "Days", "Day"]] + [ (value, "m") for value in [ @@ -78,7 +89,6 @@ def test_units_H_S_deprecated(self, unit): "millisecond", "milli", "millis", - "MS", "Milliseconds", "Millisecond", "Milli", @@ -93,7 +103,6 @@ def test_units_H_S_deprecated(self, unit): "microsecond", "micro", "micros", - "US", "Microseconds", "Microsecond", "Micro", @@ -108,7 +117,6 @@ def test_units_H_S_deprecated(self, unit): "nanosecond", "nano", "nanos", - "NS", "Nanoseconds", "Nanosecond", "Nano", @@ -250,8 +258,8 @@ def test_from_tick_reso(): def test_construction(): expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8") - assert Timedelta(10, unit="d")._value == expected - assert Timedelta(10.0, unit="d")._value == expected + assert Timedelta(10, unit="D")._value == expected + assert Timedelta(10.0, unit="D")._value == expected assert Timedelta("10 days")._value == expected assert Timedelta(days=10)._value == expected assert Timedelta(days=10.0)._value == expected diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index e1b0076d5b7b9..1aafeec2ceed5 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize( "td, expected_repr", [ - (Timedelta(10, unit="d"), "Timedelta('10 days 00:00:00')"), + (Timedelta(10, unit="D"), "Timedelta('10 days 00:00:00')"), (Timedelta(10, unit="s"), "Timedelta('0 days 00:00:10')"), (Timedelta(10, unit="ms"), "Timedelta('0 days 00:00:00.010000')"), (Timedelta(-10, unit="ms"), "Timedelta('-1 days +23:59:59.990000')"), diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 01e7ba52e58aa..8be2ec846a6d9 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -280,7 +280,7 @@ def test_timedelta_class_min_max_resolution(): class TestTimedeltaUnaryOps: def test_invert(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") msg = "bad operand type for unary ~" with pytest.raises(TypeError, match=msg): @@ -295,17 +295,17 @@ def test_invert(self): ~(td.to_timedelta64()) def test_unary_ops(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") # __neg__, __pos__ - assert -td == Timedelta(-10, unit="d") - assert -td == Timedelta("-10d") - assert +td == Timedelta(10, unit="d") + assert -td == Timedelta(-10, unit="D") + assert -td == Timedelta("-10D") + assert +td == Timedelta(10, unit="D") # __abs__, __abs__(__neg__) assert abs(td) == td assert abs(-td) == td - assert abs(-td) == Timedelta("10d") + assert abs(-td) == Timedelta("10D") class TestTimedeltas: @@ -334,7 +334,7 @@ def test_total_seconds_scalar(self): assert np.isnan(rng.total_seconds()) def test_conversion(self): - for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: + for td in [Timedelta(10, unit="D"), Timedelta("1 days, 10:11:12.012345")]: pydt = td.to_pytimedelta() assert td == Timedelta(pydt) assert td == pydt @@ -450,7 +450,7 @@ def test_numeric_conversions(self): assert Timedelta(10, 
unit="us") == np.timedelta64(10, "us") assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") assert Timedelta(10, unit="s") == np.timedelta64(10, "s") - assert Timedelta(10, unit="d") == np.timedelta64(10, "D") + assert Timedelta(10, unit="D") == np.timedelta64(10, "D") def test_timedelta_conversions(self): assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( @@ -474,7 +474,7 @@ def test_to_numpy_alias(self): td.to_numpy(copy=True) def test_identity(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") assert isinstance(td, Timedelta) assert isinstance(td, timedelta) @@ -489,7 +489,10 @@ def conv(v): assert Timedelta("1000") == np.timedelta64(1000, "ns") assert Timedelta("1000ns") == np.timedelta64(1000, "ns") - assert Timedelta("1000NS") == np.timedelta64(1000, "ns") + + msg = "'NS' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timedelta("1000NS") == np.timedelta64(1000, "ns") assert Timedelta("10us") == np.timedelta64(10000, "ns") assert Timedelta("100us") == np.timedelta64(100000, "ns") @@ -508,8 +511,10 @@ def conv(v): assert Timedelta("100s") == np.timedelta64(100000000000, "ns") assert Timedelta("1000s") == np.timedelta64(1000000000000, "ns") - assert Timedelta("1d") == conv(np.timedelta64(1, "D")) - assert Timedelta("-1d") == -conv(np.timedelta64(1, "D")) + msg = "'d' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Timedelta("1d") == conv(np.timedelta64(1, "D")) + assert Timedelta("-1D") == -conv(np.timedelta64(1, "D")) assert Timedelta("1D") == conv(np.timedelta64(1, "D")) assert Timedelta("10D") == conv(np.timedelta64(10, "D")) assert Timedelta("100D") == conv(np.timedelta64(100, "D")) @@ -663,6 +668,26 @@ def test_resolution_deprecated(self): result = Timedelta.resolution assert result == Timedelta(nanoseconds=1) + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("W", "w"), + ("D", "d"), + ("min", "MIN"), + ("s", "S"), + ("h", "H"), + ("ms", "MS"), + ("us", "US"), + ], + ) + def test_unit_deprecated(self, unit, unit_depr): + # GH#59051 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Timedelta(1, unit_depr) + assert result == Timedelta(1, unit) + @pytest.mark.parametrize( "value, expected", diff --git a/pandas/tests/scalar/timestamp/methods/test_replace.py b/pandas/tests/scalar/timestamp/methods/test_replace.py index c5169fdff0cd4..f15ea0e485cae 100644 --- a/pandas/tests/scalar/timestamp/methods/test_replace.py +++ b/pandas/tests/scalar/timestamp/methods/test_replace.py @@ -1,9 +1,9 @@ from datetime import datetime +import zoneinfo from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -111,8 +111,8 @@ def test_replace_tzinfo_equiv_tz_localize_none(self): @pytest.mark.skipif(WASM, reason="tzset is not available on WASM") def test_replace_tzinfo(self): # GH#15683 - dt = datetime(2016, 3, 27, 1) - tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + dt = datetime(2016, 3, 27, 1, fold=1) + tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo result_dt = dt.replace(tzinfo=tzinfo) result_pd = Timestamp(dt).replace(tzinfo=tzinfo) @@ -137,13 +137,16 @@ def test_replace_tzinfo(self): @pytest.mark.parametrize( "tz, normalize", [ - (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), + ("pytz/US/Eastern", lambda x: x.tzinfo.normalize(x)), (gettz("US/Eastern"), lambda x: x), ], ) def test_replace_across_dst(self, tz, normalize): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) ts_naive = Timestamp("2017-12-03 16:03:30") ts_aware = conversion.localize_pydatetime(ts_naive, tz) diff --git a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py index b576317fca8b4..beacaaf04e6b2 100644 --- a/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py +++ b/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py @@ -1,8 +1,8 @@ # NB: This is for the Timestamp.timestamp *method* specifically, not # the Timestamp class in general. 
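The file below swaps pytz.utc for the stdlib timezone.utc; nothing else changes, because the test only needs some UTC tzinfo to show that a naive Timestamp's timestamp() is computed as though the wall time were UTC. Sketch (timestamp value illustrative):

    from datetime import timezone

    import pandas as pd

    ts = pd.Timestamp("2014-10-11 11:00:01.12345678")  # tz-naive
    uts = ts.replace(tzinfo=timezone.utc)              # same wall clock, tagged UTC

    # tz-naive Timestamps are treated as UTC by .timestamp()
    assert ts.timestamp() == uts.timestamp()
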
+from datetime import timezone import pytest -from pytz import utc from pandas._libs.tslibs import Timestamp from pandas.compat import WASM @@ -18,7 +18,7 @@ def test_timestamp(self, fixed_now_ts): # GH#17329 # tz-naive --> treat it as if it were UTC for purposes of timestamp() ts = fixed_now_ts - uts = ts.replace(tzinfo=utc) + uts = ts.replace(tzinfo=timezone.utc) assert ts.timestamp() == uts.timestamp() tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central") diff --git a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py index be6ec7dbc24c7..07e57b51a7f1e 100644 --- a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py +++ b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py @@ -3,7 +3,7 @@ timedelta, ) -import pytz +import pytest from pandas._libs.tslibs.timezones import dateutil_gettz as gettz import pandas.util._test_decorators as td @@ -43,6 +43,7 @@ def test_timestamp_to_pydatetime_dateutil(self): assert stamp.tzinfo == dtval.tzinfo def test_timestamp_to_pydatetime_explicit_pytz(self): + pytz = pytest.importorskip("pytz") stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern")) dtval = stamp.to_pydatetime() assert stamp == dtval diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py index 0786cc58a4f95..90dc8d77608cb 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py @@ -1,5 +1,6 @@ from datetime import timedelta import re +import zoneinfo from dateutil.tz import gettz import pytest @@ -17,68 +18,56 @@ Timestamp, ) -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - class TestTimestampTZLocalize: @pytest.mark.skip_ubsan def test_tz_localize_pushes_out_of_bounds(self): # GH#12677 # tz_localize that pushes away from the boundary is OK + pytz = pytest.importorskip("pytz") msg = ( f"Converting {Timestamp.min.strftime('%Y-%m-%d %H:%M:%S')} " f"underflows past {Timestamp.min}" ) - pac = Timestamp.min.tz_localize("US/Pacific") + pac = Timestamp.min.tz_localize(pytz.timezone("US/Pacific")) assert pac._value > Timestamp.min._value pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.min.tz_localize("Asia/Tokyo") + Timestamp.min.tz_localize(pytz.timezone("Asia/Tokyo")) # tz_localize that pushes away from the boundary is OK msg = ( f"Converting {Timestamp.max.strftime('%Y-%m-%d %H:%M:%S')} " f"overflows past {Timestamp.max}" ) - tokyo = Timestamp.max.tz_localize("Asia/Tokyo") + tokyo = Timestamp.max.tz_localize(pytz.timezone("Asia/Tokyo")) assert tokyo._value < Timestamp.max._value tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.max.tz_localize("US/Pacific") + Timestamp.max.tz_localize(pytz.timezone("US/Pacific")) - def test_tz_localize_ambiguous_bool(self, unit): + @pytest.mark.parametrize( + "tz", + [zoneinfo.ZoneInfo("US/Central"), "dateutil/US/Central", "pytz/US/Central"], + ) + def test_tz_localize_ambiguous_bool(self, unit, tz): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 + if isinstance(tz, str) and tz.startswith("pytz/"): + tz = pytz.timezone(tz.removeprefix("pytz/")) ts = Timestamp("2015-11-01 01:00:03").as_unit(unit) - expected0 = Timestamp("2015-11-01 
01:00:03-0500", tz="US/Central") - expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz=tz) + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz=tz) msg = "Cannot infer dst time from 2015-11-01 01:00:03" with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("US/Central") + ts.tz_localize(tz) - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("dateutil/US/Central") - - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Central") - except KeyError: - # no tzdata - pass - else: - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize(tz) - - result = ts.tz_localize("US/Central", ambiguous=True) + result = ts.tz_localize(tz, ambiguous=True) assert result == expected0 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - result = ts.tz_localize("US/Central", ambiguous=False) + result = ts.tz_localize(tz, ambiguous=False) assert result == expected1 assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value @@ -205,9 +194,10 @@ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): def test_tz_localize_ambiguous_compat(self): # validate that pytz and dateutil are compat for dst # when the transition happens + pytz = pytest.importorskip("pytz") naive = Timestamp("2013-10-27 01:00:00") - pytz_zone = "Europe/London" + pytz_zone = pytz.timezone("Europe/London") dateutil_zone = "dateutil/Europe/London" result_pytz = naive.tz_localize(pytz_zone, ambiguous=False) result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=False) @@ -236,13 +226,16 @@ def test_tz_localize_ambiguous_compat(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), - "US/Eastern", + zoneinfo.ZoneInfo("US/Eastern"), "dateutil/US/Eastern", ], ) def test_timestamp_tz_localize(self, tz): + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) stamp = Timestamp("3/11/2012 04:00") result = stamp.tz_localize(tz) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 2d58513989a66..7aa6c6c0496a9 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -7,7 +7,6 @@ from dateutil.tz import gettz import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -294,7 +293,7 @@ def test_subtract_different_utc_objects(self, utc_fixture, utc_fixture2): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), "US/Eastern", "dateutil/US/Eastern", @@ -302,7 +301,9 @@ def test_subtract_different_utc_objects(self, utc_fixture, utc_fixture2): ) def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): # GH#1389 - + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) # 4 hours before DST transition stamp = Timestamp("3/10/2012 22:00", tz=tz) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 4ebdea3733484..39f302c3357de 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -18,7 +18,6 @@ import pytz from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas.compat import PY310 from 
pandas.errors import OutOfBoundsDatetime from pandas import ( @@ -123,6 +122,7 @@ def test_timestamp_constructor_pytz_fold_raise(self): # Test for GH#25057 # pytz doesn't support fold. Check that we raise # if fold is passed with pytz + pytz = pytest.importorskip("pytz") msg = "pytz timezones do not support fold. Please use dateutil timezones." tz = pytz.timezone("Europe/London") with pytest.raises(ValueError, match=msg): @@ -160,15 +160,13 @@ def test_timestamp_constructor_retain_fold(self, tz, fold): expected = fold assert result == expected - try: - _tzs = [ + @pytest.mark.parametrize( + "tz", + [ "dateutil/Europe/London", zoneinfo.ZoneInfo("Europe/London"), - ] - except zoneinfo.ZoneInfoNotFoundError: - _tzs = ["dateutil/Europe/London"] - - @pytest.mark.parametrize("tz", _tzs) + ], + ) @pytest.mark.parametrize( "ts_input,fold_out", [ @@ -211,11 +209,7 @@ def test_timestamp_constructor_adjust_value_for_fold(self, tz, fold, value_out): class TestTimestampConstructorPositionalAndKeywordSupport: def test_constructor_positional(self): # see GH#10758 - msg = ( - "'NoneType' object cannot be interpreted as an integer" - if PY310 - else "an integer is required" - ) + msg = "'NoneType' object cannot be interpreted as an integer" with pytest.raises(TypeError, match=msg): Timestamp(2000, 1) @@ -565,11 +559,11 @@ def test_constructor(self): timezones = [ (None, 0), ("UTC", 0), - (pytz.utc, 0), + (timezone.utc, 0), ("Asia/Tokyo", 9), ("US/Eastern", -4), ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), + (timezone(timedelta(hours=-3)), -3), (dateutil.tz.tzoffset(None, 18000), 5), ] @@ -622,11 +616,11 @@ def test_constructor_with_stringoffset(self): timezones = [ ("UTC", 0), - (pytz.utc, 0), + (timezone.utc, 0), ("Asia/Tokyo", 9), ("US/Eastern", -4), ("dateutil/US/Pacific", -7), - (pytz.FixedOffset(-180), -3), + (timezone(timedelta(hours=-3)), -3), (dateutil.tz.tzoffset(None, 18000), 5), ] @@ -706,7 +700,7 @@ def test_constructor_invalid_tz(self): msg = "at most one of" with pytest.raises(ValueError, match=msg): - Timestamp("2017-10-22", tzinfo=pytz.utc, tz="UTC") + Timestamp("2017-10-22", tzinfo=timezone.utc, tz="UTC") msg = "Cannot pass a date attribute keyword argument when passing a date string" with pytest.raises(ValueError, match=msg): @@ -719,11 +713,11 @@ def test_constructor_tz_or_tzinfo(self): # GH#17943, GH#17690, GH#5168 stamps = [ Timestamp(year=2017, month=10, day=22, tz="UTC"), - Timestamp(year=2017, month=10, day=22, tzinfo=pytz.utc), - Timestamp(year=2017, month=10, day=22, tz=pytz.utc), - Timestamp(datetime(2017, 10, 22), tzinfo=pytz.utc), + Timestamp(year=2017, month=10, day=22, tzinfo=timezone.utc), + Timestamp(year=2017, month=10, day=22, tz=timezone.utc), + Timestamp(datetime(2017, 10, 22), tzinfo=timezone.utc), Timestamp(datetime(2017, 10, 22), tz="UTC"), - Timestamp(datetime(2017, 10, 22), tz=pytz.utc), + Timestamp(datetime(2017, 10, 22), tz=timezone.utc), ] assert all(ts == stamps[0] for ts in stamps) @@ -898,13 +892,13 @@ def test_construct_timestamp_near_dst(self, offset): def test_construct_with_different_string_format(self, arg): # GH 12064 result = Timestamp(arg) - expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540)) + expected = Timestamp(datetime(2013, 1, 1), tz=timezone(timedelta(hours=9))) assert result == expected @pytest.mark.parametrize("box", [datetime, Timestamp]) def test_raise_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 - kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": pytz.utc} + kwargs = {"year": 2018, "month": 
1, "day": 1, "tzinfo": timezone.utc} msg = "Cannot pass a datetime or Timestamp" with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tz="US/Pacific") @@ -912,7 +906,7 @@ def test_raise_tz_and_tzinfo_in_datetime_input(self, box): with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) - def test_dont_convert_dateutil_utc_to_pytz_utc(self): + def test_dont_convert_dateutil_utc_to_default_utc(self): result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc()) assert result == expected @@ -996,7 +990,7 @@ def test_timestamp_constructor_near_dst_boundary(self): @pytest.mark.parametrize( "tz", [ - pytz.timezone("US/Eastern"), + "pytz/US/Eastern", gettz("US/Eastern"), "US/Eastern", "dateutil/US/Eastern", @@ -1005,7 +999,9 @@ def test_timestamp_constructor_near_dst_boundary(self): def test_timestamp_constructed_by_date_and_tz(self, tz): # GH#2993, Timestamp cannot be constructed by datetime.date # and tz correctly - + if isinstance(tz, str) and tz.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone(tz.removeprefix("pytz/")) result = Timestamp(date(2012, 3, 11), tz=tz) expected = Timestamp("3/11/2012", tz=tz) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 44db1187850c9..7b20f0a17556d 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -1,9 +1,11 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import pprint import dateutil.tz import pytest -import pytz # a test below uses pytz but only inside a `eval` call from pandas.compat import WASM @@ -181,14 +183,14 @@ def test_repr_matches_pydatetime_no_tz(self): ts_nanos_micros = Timestamp(1200) assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200" - def test_repr_matches_pydatetime_tz_pytz(self): - dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc) + def test_repr_matches_pydatetime_tz_stdlib(self): + dt_date = datetime(2013, 1, 2, tzinfo=timezone.utc) assert str(dt_date) == str(Timestamp(dt_date)) - dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc) + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=timezone.utc) assert str(dt_datetime) == str(Timestamp(dt_datetime)) - dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc) + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=timezone.utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) def test_repr_matches_pydatetime_tz_dateutil(self): diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 79fd285073983..38d0ddfbc13bd 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -9,6 +9,7 @@ import locale import time import unicodedata +import zoneinfo from dateutil.tz import ( tzlocal, @@ -20,8 +21,6 @@ ) import numpy as np import pytest -import pytz -from pytz import utc from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.timezones import ( @@ -259,7 +258,7 @@ def test_dow_parametric(self, ts, sign): class TestTimestamp: - @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) + @pytest.mark.parametrize("tz", [None, zoneinfo.ZoneInfo("US/Pacific")]) def test_disallow_setting_tz(self, tz): # GH#3746 ts = Timestamp("2010") @@ -311,7 +310,7 @@ def compare(x, y): assert 
int((Timestamp(x)._value - Timestamp(y)._value) / 1e9) == 0 compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now("UTC"), datetime.now(pytz.timezone("UTC"))) + compare(Timestamp.now("UTC"), datetime.now(timezone.utc)) compare(Timestamp.now("UTC"), datetime.now(tzutc())) msg = "Timestamp.utcnow is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -329,12 +328,12 @@ def compare(x, y): compare( # Support tz kwarg in Timestamp.fromtimestamp Timestamp.fromtimestamp(current_time, "UTC"), - datetime.fromtimestamp(current_time, utc), + datetime.fromtimestamp(current_time, timezone.utc), ) compare( # Support tz kwarg in Timestamp.fromtimestamp Timestamp.fromtimestamp(current_time, tz="UTC"), - datetime.fromtimestamp(current_time, utc), + datetime.fromtimestamp(current_time, timezone.utc), ) date_component = datetime.now(timezone.utc) @@ -585,9 +584,9 @@ def test_month_name(self, dt64, ts): assert ts.month_name() == alt.month_name() def test_tz_convert(self, ts): - ts = Timestamp._from_value_and_reso(ts._value, ts._creso, utc) + ts = Timestamp._from_value_and_reso(ts._value, ts._creso, timezone.utc) - tz = pytz.timezone("US/Pacific") + tz = zoneinfo.ZoneInfo("US/Pacific") result = ts.tz_convert(tz) assert isinstance(result, Timestamp) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 8c60f7beb317d..03e823ce607fb 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -11,6 +11,8 @@ import pytest import pytz +from pandas._config import using_string_dtype + from pandas._libs.tslibs.timezones import maybe_get_tz from pandas.core.dtypes.common import ( @@ -256,9 +258,8 @@ def test_dt_accessor_limited_display_api(self): tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # Period - idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - ser = Series(idx) + idx = period_range("20130101", periods=5, freq="D", name="xxx") + ser = Series(idx) results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) @@ -513,6 +514,7 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1]) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime(self): # GH 10086 ser = Series(date_range("20130101", periods=5)) @@ -555,6 +557,7 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_days(self): ser = Series(date_range("20130101", periods=5)) ser.iloc[0] = pd.NaT @@ -585,6 +588,7 @@ def test_strftime_period_days(self, using_infer_string): expected = expected.astype("string[pyarrow_numpy]") tm.assert_index_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_microsecond_resolution(self): ser = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) result = ser.dt.strftime("%Y-%m-%d %H:%M:%S") @@ -617,6 +621,7 @@ def test_strftime_period_minutes(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data", [ diff --git 
a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 3b41c8ee463d8..97cafc33611ed 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -14,7 +14,6 @@ ) import numpy as np import pytest -import pytz from pandas._libs import index as libindex @@ -63,6 +62,7 @@ def test_fancy_setitem(): @pytest.mark.parametrize("tz_source", ["pytz", "dateutil"]) def test_getitem_setitem_datetime_tz(tz_source): if tz_source == "pytz": + pytz = pytest.importorskip(tz_source) tzget = pytz.timezone else: # handle special case for utc in dateutil diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 5002b6d20da09..9f310d8c8ab5f 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.errors import IndexingError from pandas import ( @@ -249,6 +251,7 @@ def test_slice(string_series, object_series): tm.assert_series_equal(string_series, original) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) @@ -432,28 +435,38 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py - # but checks for warnings instead of errors. - def _check_setitem_invalid(self, ser, invalid, indexer, warn): - msg = "Setting an item of incompatible dtype is deprecated" - msg = re.escape(msg) - + def _check_setitem_invalid(self, ser, invalid, indexer): orig_ser = ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser.loc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(warn, match=msg): + with pytest.raises(TypeError, match="Invalid value"): ser[:] = invalid + def _check_setitem_valid(self, ser, value, indexer): + orig_ser = ser.copy() + + ser[indexer] = value + ser = orig_ser.copy() + + ser.iloc[indexer] = value + ser = orig_ser.copy() + + ser.loc[indexer] = value + ser = orig_ser.copy() + + ser[:] = value + _invalid_scalars = [ 1 + 2j, "True", @@ -471,20 +484,19 @@ def _check_setitem_invalid(self, ser, invalid, indexer, warn): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): ser = Series([True, False, False], dtype="bool") - self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + self._check_setitem_invalid(ser, invalid, indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) if isna(invalid) and invalid is not NaT and not np.isnat(invalid): - warn = None + self._check_setitem_valid(ser, invalid, indexer) else: - warn = FutureWarning - self._check_setitem_invalid(ser, invalid, indexer, warn) + self._check_setitem_invalid(ser, invalid, 
indexer) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): ser = Series([1, 2, None], dtype=float_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + self._check_setitem_invalid(ser, invalid, indexer) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index b94e6b6f0c6c8..3fcf664c3f01b 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1,3 +1,4 @@ +import contextlib from datetime import ( date, datetime, @@ -7,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError @@ -273,25 +276,16 @@ def test_setitem_mask_align_and_promote(self): mask = ts > 0 left = ts.copy() right = ts[mask].copy().map(str) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): left[mask] = right - expected = ts.map(lambda t: str(t) if t > 0 else t) - tm.assert_series_equal(left, expected) def test_setitem_mask_promote_strs(self): ser = Series([0, 1, 2, 0]) mask = ser > 0 ser2 = ser[mask].map(str) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): ser[mask] = ser2 - expected = Series([0, "1", "2", 0]) - tm.assert_series_equal(ser, expected) - def test_setitem_mask_promote(self): ser = Series([0, "foo", "bar", 0]) mask = Series([False, True, True, False]) @@ -379,12 +373,8 @@ def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(self): def test_setitem_nan_with_bool(self): # GH 13034 result = Series([True, False, True]) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): result[0] = np.nan - expected = Series([np.nan, False, True], dtype=object) - tm.assert_series_equal(result, expected) def test_setitem_mask_smallint_upcast(self): orig = Series([1, 2, 3], dtype="int8") @@ -393,22 +383,14 @@ def test_setitem_mask_smallint_upcast(self): mask = np.array([True, False, True]) ser = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): ser[mask] = Series(alt) - expected = Series([999, 2, 1001]) - tm.assert_series_equal(ser, expected) - ser2 = orig.copy() - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): - ser2.mask(mask, alt, inplace=True) - tm.assert_series_equal(ser2, expected) + with pytest.raises(TypeError, match="Invalid value"): + ser.mask(mask, alt, inplace=True) - ser3 = orig.copy() - res = ser3.where(~mask, Series(alt)) + res = ser.where(~mask, Series(alt)) + expected = Series([999, 2, 1001]) tm.assert_series_equal(res, expected) def test_setitem_mask_smallint_no_upcast(self): @@ -461,9 +443,9 @@ def test_dt64tz_setitem_does_not_mutate_dti(self): ser = Series(dti) assert ser._values is not dti assert ser._values._ndarray.base is dti._data._ndarray.base - assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base + assert ser._mgr.blocks[0].values._ndarray.base is dti._data._ndarray.base - assert ser._mgr.arrays[0] is not dti + assert 
ser._mgr.blocks[0].values is not dti ser[::3] = NaT assert ser[0] is NaT @@ -548,6 +530,7 @@ def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): tm.assert_series_equal(ser, expected) assert isinstance(ser["td"], Timedelta) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_with_expansion_type_promotion(self): # GH#12599 ser = Series(dtype=object) @@ -557,6 +540,7 @@ def test_setitem_with_expansion_type_promotion(self): expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) tm.assert_series_equal(ser, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_not_contained(self, string_series): # set item that's not contained ser = string_series.copy() @@ -575,32 +559,35 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): tm.assert_series_equal(ser, expected) @pytest.mark.parametrize( - "na, target_na, dtype, target_dtype, indexer, warn", + "na, target_na, dtype, target_dtype, indexer, raises", [ - (NA, NA, "Int64", "Int64", 1, None), - (NA, NA, "Int64", "Int64", 2, None), - (NA, np.nan, "int64", "float64", 1, None), - (NA, np.nan, "int64", "float64", 2, None), - (NaT, NaT, "int64", "object", 1, FutureWarning), - (NaT, NaT, "int64", "object", 2, None), - (np.nan, NA, "Int64", "Int64", 1, None), - (np.nan, NA, "Int64", "Int64", 2, None), - (np.nan, NA, "Float64", "Float64", 1, None), - (np.nan, NA, "Float64", "Float64", 2, None), - (np.nan, np.nan, "int64", "float64", 1, None), - (np.nan, np.nan, "int64", "float64", 2, None), + (NA, NA, "Int64", "Int64", 1, False), + (NA, NA, "Int64", "Int64", 2, False), + (NA, np.nan, "int64", "float64", 1, False), + (NA, np.nan, "int64", "float64", 2, False), + (NaT, NaT, "int64", "object", 1, True), + (NaT, NaT, "int64", "object", 2, False), + (np.nan, NA, "Int64", "Int64", 1, False), + (np.nan, NA, "Int64", "Int64", 2, False), + (np.nan, NA, "Float64", "Float64", 1, False), + (np.nan, NA, "Float64", "Float64", 2, False), + (np.nan, np.nan, "int64", "float64", 1, False), + (np.nan, np.nan, "int64", "float64", 2, False), ], ) def test_setitem_enlarge_with_na( - self, na, target_na, dtype, target_dtype, indexer, warn + self, na, target_na, dtype, target_dtype, indexer, raises ): # GH#32346 ser = Series([1, 2], dtype=dtype) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + ser[indexer] = na + else: ser[indexer] = na - expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] - expected = Series(expected_values, dtype=target_dtype) - tm.assert_series_equal(ser, expected) + expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] + expected = Series(expected_values, dtype=target_dtype) + tm.assert_series_equal(ser, expected) def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): # GH#48665 @@ -694,14 +681,8 @@ def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique): if not unique: ser.index = [1, 1] - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_sli(ser)[1] = val - assert type(ser.iloc[1]) == type(val) - - expected = Series([True, val], dtype=object, index=ser.index) - if not unique and indexer_sli is not tm.iloc: - expected = Series([val, val], dtype=object, index=[1, 1]) - tm.assert_series_equal(ser, expected) def test_setitem_boolean_array_into_npbool(self): # 
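From here on, most of the churn in the indexing tests is mechanical: blocks of tm.assert_produces_warning(FutureWarning, match="incompatible dtype") become pytest.raises(TypeError, match="Invalid value"), because setting an incompatible value into a Series is now an error instead of a warning plus silent upcast. A sketch of the behavior these tests now assert (an explicit astype is the supported way to get the old result):

    import pandas as pd

    ser = pd.Series([1, 2, 3])  # int64
    try:
        ser[0] = 2.7  # previously: FutureWarning, then silent upcast to float64
    except TypeError:
        ser = ser.astype("float64")  # opt in to the cast explicitly
        ser[0] = 2.7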
GH#45462 @@ -712,10 +693,8 @@ def test_setitem_boolean_array_into_npbool(self): ser[:2] = arr[:2] # no NAs -> can set inplace assert ser._values is values - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[1:] = arr[1:] # has an NA -> cast to boolean dtype - expected = Series(arr) - tm.assert_series_equal(ser, expected) class SetitemCastingEquivalents: @@ -759,64 +738,72 @@ def _check_inplace(self, is_inplace, orig, arr, obj): # otherwise original array should be unchanged tm.assert_equal(arr, orig._values) - def test_int_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + def test_int_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): if not isinstance(key, int): pytest.skip("Not relevant for int key") + if raises: + ctx = pytest.raises(TypeError, match="Invalid value") + else: + ctx = contextlib.nullcontext() - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) if indexer_sli is tm.loc: - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, tm.at, is_inplace) elif indexer_sli is tm.iloc: - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, tm.iat, is_inplace) rng = range(key, key + 1) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, rng, expected, val, indexer_sli, is_inplace) if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently slc = slice(key, key + 1) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, slc, expected, val, indexer_sli, is_inplace) ilkey = [key] - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) genkey = (x for x in [key]) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) - def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + def test_slice_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): if not isinstance(key, slice): pytest.skip("Not relevant for slice key") + if raises: + ctx = pytest.raises(TypeError, match="Invalid value") + else: + ctx = contextlib.nullcontext() if indexer_sli is not tm.loc: # Note: no .loc because that handles slice edges differently - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) ilkey = list(range(len(obj)))[key] - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) indkey = np.array(ilkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) genkey = (x for x in indkey) - with tm.assert_produces_warning(warn, match="incompatible dtype"): + with ctx: self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) - def 
test_mask_key(self, obj, key, expected, warn, val, indexer_sli): + def test_mask_key(self, obj, key, expected, raises, val, indexer_sli): # setitem with boolean mask mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -829,11 +816,13 @@ def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): indexer_sli(obj)[mask] = val return - with tm.assert_produces_warning(warn, match="incompatible dtype"): + if raises: + with pytest.raises(TypeError, match="Invalid value"): + indexer_sli(obj)[mask] = val + else: indexer_sli(obj)[mask] = val - tm.assert_series_equal(obj, expected) - def test_series_where(self, obj, key, expected, warn, val, is_inplace): + def test_series_where(self, obj, key, expected, raises, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -860,7 +849,8 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val, using_infer_string): + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) + def test_index_where(self, obj, key, expected, raises, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -872,7 +862,8 @@ def test_index_where(self, obj, key, expected, warn, val, using_infer_string): expected_idx = Index(expected, dtype=expected.dtype) tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) + def test_index_putmask(self, obj, key, expected, raises, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -885,7 +876,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): @pytest.mark.parametrize( - "obj,expected,key,warn", + "obj,expected,key,raises", [ pytest.param( # GH#45568 setting a valid NA value into IntervalDtype[int] should @@ -896,7 +887,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): dtype="interval[float64]", ), 1, - FutureWarning, + True, id="interval_int_na_value", ), pytest.param( @@ -904,14 +895,14 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), slice(None, None, 2), - None, + False, id="int_series_slice_key_step", ), pytest.param( Series([True, True, False, False]), Series([np.nan, True, np.nan, False], dtype=object), slice(None, None, 2), - FutureWarning, + True, id="bool_series_slice_key_step", ), pytest.param( @@ -919,7 +910,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series(np.arange(10)), Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), slice(None, 5), - None, + False, id="int_series_slice_key", ), pytest.param( @@ -927,7 +918,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([1, 2, 3]), Series([np.nan, 2, 3]), 0, - None, + False, id="int_series_int_key", ), pytest.param( @@ -936,7 +927,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([np.nan], dtype=object), # TODO: maybe go to float64 since we are changing the _whole_ Series? 
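The ctx pattern introduced in test_int_key and test_slice_key above keeps a single code path for both outcomes: a raises fixture selects between a context manager that asserts the TypeError and a no-op one. Restated on its own, assuming the same TypeError/"Invalid value" contract (expectation is an illustrative helper name):

    import contextlib
    import pytest

    def expectation(raises: bool):
        # One `with` block either asserts the TypeError or does nothing,
        # so each indexing call is written only once in the test body.
        if raises:
            return pytest.raises(TypeError, match="Invalid value")
        return contextlib.nullcontext()

Used as `with expectation(raises): ser[key] = val`, mirroring the fixture-driven tests above.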
0, - FutureWarning, + True, id="bool_series_int_key_change_all", ), pytest.param( @@ -944,7 +935,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): Series([False, True]), Series([np.nan, True], dtype=object), 0, - FutureWarning, + True, id="bool_series_int_key", ), ], @@ -994,8 +985,8 @@ def key(self): return 0 @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemDT64IntoInt(SetitemCastingEquivalents): @@ -1034,8 +1025,8 @@ def val(self, scalar, request): return box([scalar, scalar]) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class TestSetitemNAPeriodDtype(SetitemCastingEquivalents): @@ -1061,8 +1052,8 @@ def val(self, request): return request.param @pytest.fixture - def warn(self): - return None + def raises(self): + return False class TestSetitemNADatetimeLikeDtype(SetitemCastingEquivalents): @@ -1114,8 +1105,8 @@ def key(self): return 0 @pytest.fixture - def warn(self, is_inplace): - return None if is_inplace else FutureWarning + def raises(self, is_inplace): + return False if is_inplace else True class TestSetitemMismatchedTZCastsToObject(SetitemCastingEquivalents): @@ -1146,24 +1137,23 @@ def expected(self, obj, val): return expected @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "obj,expected,warn", + "obj,expected", [ # For numeric series, we should coerce to NaN. - (Series([1, 2, 3]), Series([np.nan, 2, 3]), None), - (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None), + (Series([1, 2, 3]), Series([np.nan, 2, 3])), + (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0])), # For datetime series, we should coerce to NaT. ( Series([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]), Series([NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), - None, ), # For objects, we should preserve the None value. 
- (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"]), None), + (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"])), ], ) class TestSeriesNoneCoercion(SetitemCastingEquivalents): @@ -1175,6 +1165,10 @@ def key(self): def val(self): return None + @pytest.fixture + def raises(self): + return False + class TestSetitemFloatIntervalWithIntIntervalValues(SetitemCastingEquivalents): # GH#44201 Cast to shared IntervalDtype rather than object @@ -1185,34 +1179,46 @@ def test_setitem_example(self): obj = Series(idx) val = Interval(0.5, 1.5) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with pytest.raises(TypeError, match="Invalid value"): obj[0] = val - assert obj.dtype == "Interval[float64, right]" @pytest.fixture def obj(self): + """ + Fixture to create a Series [(0, 1], (1, 2], (2, 3]] + """ idx = IntervalIndex.from_breaks(range(4)) return Series(idx) @pytest.fixture def val(self): + """ + Fixture to get an interval (0.5, 1.5] + """ return Interval(0.5, 1.5) @pytest.fixture def key(self): + """ + Fixture to get a key 0 + """ return 0 @pytest.fixture def expected(self, obj, val): + """ + Fixture to get a Series [(0.5, 1.5], (1.0, 2.0], (2.0, 3.0]] + """ data = [val] + list(obj[1:]) idx = IntervalIndex(data, dtype="Interval[float64]") return Series(idx) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + """ + Fixture to enable raising pytest exceptions + """ + return True class TestSetitemRangeIntoIntegerSeries(SetitemCastingEquivalents): @@ -1240,18 +1246,18 @@ def expected(self, any_int_numpy_dtype): return exp @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val, warn", + "val, raises", [ - (np.array([2.0, 3.0]), None), - (np.array([2.5, 3.5]), FutureWarning), + (np.array([2.0, 3.0]), False), + (np.array([2.5, 3.5]), True), ( np.array([2**65, 2**65 + 1], dtype=np.float64), - FutureWarning, + True, ), # all ints, but can't cast ], ) @@ -1291,8 +1297,8 @@ def expected(self): return Series([1, 512, 3], dtype=np.int16) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True @pytest.mark.parametrize("val", [2**33 + 1.0, 2**33 + 1.1, 2**62]) @@ -1315,8 +1321,8 @@ def expected(self, val): return Series([val, 2, 3], dtype=dtype) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True class CoercionTest(SetitemCastingEquivalents): @@ -1334,8 +1340,8 @@ def expected(self, obj, key, val, exp_dtype): @pytest.mark.parametrize( - "val,exp_dtype,warn", - [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, FutureWarning)], + "val,exp_dtype,raises", + [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, True)], ) class TestCoercionInt8(CoercionTest): # previously test_setitem_series_int8 in tests.indexing.test_coercion @@ -1353,17 +1359,17 @@ def obj(self): return Series(["a", "b", "c", "d"], dtype=object) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.complex128, None), - (1.1, np.complex128, None), - (1 + 1j, np.complex128, None), - (True, object, FutureWarning), + (1, np.complex128, False), + (1.1, np.complex128, False), + (1 + 1j, np.complex128, False), + (True, object, True), ], ) class TestCoercionComplex(CoercionTest): @@ -1374,14 +1380,14 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, 
object, FutureWarning), - ("3", object, FutureWarning), - (3, object, FutureWarning), - (1.1, object, FutureWarning), - (1 + 1j, object, FutureWarning), - (True, bool, None), + (1, object, True), + ("3", object, True), + (3, object, True), + (1.1, object, True), + (1 + 1j, object, True), + (True, bool, False), ], ) class TestCoercionBool(CoercionTest): @@ -1392,12 +1398,12 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.int64, None), - (1.1, np.float64, FutureWarning), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), + (1, np.int64, False), + (1.1, np.float64, True), + (1 + 1j, np.complex128, True), + (True, object, True), ], ) class TestCoercionInt64(CoercionTest): @@ -1408,12 +1414,12 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.float64, None), - (1.1, np.float64, None), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), + (1, np.float64, False), + (1.1, np.float64, False), + (1 + 1j, np.complex128, True), + (True, object, True), ], ) class TestCoercionFloat64(CoercionTest): @@ -1424,13 +1430,13 @@ def obj(self): @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (1, np.float32, None), + (1, np.float32, False), pytest.param( 1.1, np.float32, - None, + False, marks=pytest.mark.xfail( ( not np_version_gte1p24 @@ -1440,16 +1446,16 @@ def obj(self): "np_can_hold_element raises and we cast to float64", ), ), - (1 + 1j, np.complex128, FutureWarning), - (True, object, FutureWarning), - (np.uint8(2), np.float32, None), - (np.uint32(2), np.float32, None), + (1 + 1j, np.complex128, True), + (True, object, True), + (np.uint8(2), np.float32, False), + (np.uint32(2), np.float32, False), # float32 cannot hold np.iinfo(np.uint32).max exactly # (closest it can hold is 4294967300.0 which off by 5.0), so # we cast to float64 - (np.uint32(np.iinfo(np.uint32).max), np.float64, FutureWarning), - (np.uint64(2), np.float32, None), - (np.int64(2), np.float32, None), + (np.uint32(np.iinfo(np.uint32).max), np.float64, True), + (np.uint64(2), np.float32, False), + (np.int64(2), np.float32, False), ], ) class TestCoercionFloat32(CoercionTest): @@ -1457,8 +1463,8 @@ class TestCoercionFloat32(CoercionTest): def obj(self): return Series([1.1, 2.2, 3.3, 4.4], dtype=np.float32) - def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): - super().test_slice_key(obj, key, expected, warn, val, indexer_sli, is_inplace) + def test_slice_key(self, obj, key, expected, raises, val, indexer_sli, is_inplace): + super().test_slice_key(obj, key, expected, raises, val, indexer_sli, is_inplace) if isinstance(val, float): # the xfail would xpass bc test_slice_key short-circuits @@ -1494,16 +1500,16 @@ def val(self, exp_dtype): return ts @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timestamp("2012-01-01"), "datetime64[ns]", None), - (1, object, FutureWarning), - ("x", object, FutureWarning), + (Timestamp("2012-01-01"), "datetime64[ns]", False), + (1, object, True), + ("x", object, True), ], ) class TestCoercionDatetime64(CoercionTest): @@ -1514,18 +1520,18 @@ def obj(self): return Series(date_range("2011-01-01", freq="D", periods=4)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - 
(Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", None), + (Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", False), # pre-2.0, a mis-matched tz would end up casting to object - (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", None), - (Timestamp("2012-01-01"), object, FutureWarning), - (1, object, FutureWarning), + (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", False), + (Timestamp("2012-01-01"), object, True), + (1, object, True), ], ) class TestCoercionDatetime64TZ(CoercionTest): @@ -1536,16 +1542,16 @@ def obj(self): return Series(date_range("2011-01-01", freq="D", periods=4, tz=tz)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( - "val,exp_dtype,warn", + "val,exp_dtype,raises", [ - (Timedelta("12 day"), "timedelta64[ns]", None), - (1, object, FutureWarning), - ("x", object, FutureWarning), + (Timedelta("12 day"), "timedelta64[ns]", False), + (1, object, True), + ("x", object, True), ], ) class TestCoercionTimedelta64(CoercionTest): @@ -1555,8 +1561,8 @@ def obj(self): return Series(timedelta_range("1 day", periods=4)) @pytest.fixture - def warn(self): - return None + def raises(self): + return False @pytest.mark.parametrize( @@ -1575,63 +1581,45 @@ def obj(self, request): return Series(request.param) @pytest.fixture - def warn(self): - return FutureWarning + def raises(self): + return True def test_20643(): # closed by GH#45121 orig = Series([0, 1, 2], index=["a", "b", "c"]) - expected = Series([0, 2.7, 2], index=["a", "b", "c"]) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.at["b"] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.loc["b"] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser["b"] = 2.7 - tm.assert_series_equal(ser, expected) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iat[1] = 2.7 - tm.assert_series_equal(ser, expected) - ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[1] = 2.7 - tm.assert_series_equal(ser, expected) orig_df = orig.to_frame("A") - expected_df = expected.to_frame("A") df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.at["b", "A"] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.loc["b", "A"] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): df.iloc[1, 0] = 2.7 - tm.assert_frame_equal(df, expected_df) - df = orig_df.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): 
df.iat[1, 0] = 2.7 - tm.assert_frame_equal(df, expected_df) def test_20643_comment(): @@ -1653,35 +1641,23 @@ def test_15413(): # fixed by GH#45121 ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[ser == 2] += 0.5 - expected = Series([1, 2.5, 3]) - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.loc[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iloc[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.iat[1] += 0.5 - tm.assert_series_equal(ser, expected) - ser = Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser.at[1] += 0.5 - tm.assert_series_equal(ser, expected) def test_32878_int_itemsize(): @@ -1689,10 +1665,8 @@ def test_32878_int_itemsize(): arr = np.arange(5).astype("i4") ser = Series(arr) val = np.int64(np.iinfo(np.int64).max) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = val - expected = Series([val, 1, 2, 3, 4], dtype=np.int64) - tm.assert_series_equal(ser, expected) def test_32878_complex_itemsize(): @@ -1702,20 +1676,15 @@ def test_32878_complex_itemsize(): val = val.astype("c16") # GH#32878 used to coerce val to inf+0.000000e+00j - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = val - assert ser[0] == val - expected = Series([val, 1, 2, 3, 4], dtype="c16") - tm.assert_series_equal(ser, expected) def test_37692(indexer_al): # GH#37692 ser = Series([1, 2, 3], index=["a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_al(ser)["b"] = "test" - expected = Series([1, "test", 3], index=["a", "b", "c"], dtype=object) - tm.assert_series_equal(ser, expected) def test_setitem_bool_int_float_consistency(indexer_sli): @@ -1725,14 +1694,12 @@ def test_setitem_bool_int_float_consistency(indexer_sli): # as the setitem can be done losslessly for dtype in [np.float64, np.int64]: ser = Series(0, index=range(3), dtype=dtype) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): indexer_sli(ser)[0] = True - assert ser.dtype == object ser = Series(0, index=range(3), dtype=bool) - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser[0] = dtype(1) - assert ser.dtype == object # 1.0 can be held losslessly, so no casting ser = Series(0, index=range(3), dtype=np.int64) diff --git a/pandas/tests/series/indexing/test_where.py 
b/pandas/tests/series/indexing/test_where.py
index 4979bcb42d7ab..053c290999f2f 100644
--- a/pandas/tests/series/indexing/test_where.py
+++ b/pandas/tests/series/indexing/test_where.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest

-from pandas._config import using_pyarrow_string_dtype
+from pandas._config import using_string_dtype

 from pandas.core.dtypes.common import is_integer

@@ -55,15 +55,13 @@ def test_where_unsafe_upcast(dtype, expected_dtype):
     s = Series(np.arange(10), dtype=dtype)
     values = [2.5, 3.5, 4.5, 5.5, 6.5]
     mask = s < 5
-    expected = Series(values + list(range(5, 10)), dtype=expected_dtype)
-    warn = (
-        None
-        if np.dtype(dtype).kind == np.dtype(expected_dtype).kind == "f"
-        else FutureWarning
-    )
-    with tm.assert_produces_warning(warn, match="incompatible dtype"):
+    if np.dtype(dtype).kind == np.dtype(expected_dtype).kind == "f":
         s[mask] = values
-    tm.assert_series_equal(s, expected)
+        expected = Series(values + list(range(5, 10)), dtype=expected_dtype)
+        tm.assert_series_equal(s, expected)
+    else:
+        with pytest.raises(TypeError, match="Invalid value"):
+            s[mask] = values


 def test_where_unsafe():
@@ -74,9 +72,11 @@ def test_where_unsafe():
     mask = s > 5
     expected = Series(list(range(6)) + values, dtype="float64")

-    with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
+    with pytest.raises(TypeError, match="Invalid value"):
         s[mask] = values
-    tm.assert_series_equal(s, expected)
+    s = s.astype("float64")
+    s[mask] = values
+    tm.assert_series_equal(s, expected)

     # see gh-3235
     s = Series(np.arange(10), dtype="int64")
@@ -232,7 +231,7 @@ def test_where_ndframe_align():
     tm.assert_series_equal(out, expected)


-@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string")
+@pytest.mark.xfail(using_string_dtype(), reason="can't set ints into string")
 def test_where_setitem_invalid():
     # GH 2702
     # make sure correct exceptions are raised on invalid list assignment
diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py
index 432c0eceee011..c1082c06ce307 100644
--- a/pandas/tests/series/methods/test_argsort.py
+++ b/pandas/tests/series/methods/test_argsort.py
@@ -20,21 +20,15 @@ def test_argsort_axis(self):

     def test_argsort_numpy(self, datetime_series):
         ser = datetime_series
-
         res = np.argsort(ser).values
         expected = np.argsort(np.array(ser))
         tm.assert_numpy_array_equal(res, expected)

-        # with missing values
-        ts = ser.copy()
-        ts[::2] = np.nan
-
-        msg = "The behavior of Series.argsort in the presence of NA values"
-        with tm.assert_produces_warning(
-            FutureWarning, match=msg, check_stacklevel=False
-        ):
-            result = np.argsort(ts)[1::2]
-        expected = np.argsort(np.array(ts.dropna()))
+    def test_argsort_numpy_missing(self):
+        data = [0.1, np.nan, 0.2, np.nan, 0.3]
+        ser = Series(data)
+        result = np.argsort(ser)
+        expected = np.argsort(np.array(data))
         tm.assert_numpy_array_equal(result.values, expected)

@@ -56,10 +50,8 @@ def test_argsort_dt64(self, unit):
         expected = Series(range(5), dtype=np.intp)
         tm.assert_series_equal(result, expected)

-        msg = "The behavior of Series.argsort in the presence of NA values"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = shifted.argsort()
-        expected = Series(list(range(4)) + [-1], dtype=np.intp)
+        result = shifted.argsort()
+        expected = Series(list(range(4)) + [4], dtype=np.intp)
         tm.assert_series_equal(result, expected)

     def test_argsort_stable(self):
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index
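The test_argsort hunks record a second enforced deprecation: Series.argsort no longer warns in the presence of missing values and no longer reports them as -1; NA positions now simply sort to the end, matching numpy's treatment. For example, under the behavior these tests now expect:

    import numpy as np
    import pandas as pd

    ser = pd.Series([1.0, np.nan, 0.5])
    # the NaN's position (1) sorts last instead of appearing as -1
    assert ser.argsort().tolist() == [2, 0, 1]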
4b2122e25f819..d2d92d7273d3d 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -298,7 +298,7 @@ def test_astype_str_cast_dt64(self): def test_astype_str_cast_td64(self): # see GH#9757 - td = Series([Timedelta(1, unit="d")]) + td = Series([Timedelta(1, unit="D")]) ser = td.astype(str) expected = Series(["1 days"], dtype=object) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 0f2f533c8feff..293919173c2d5 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -78,7 +78,7 @@ def test_combine_first_dt64(self, unit): s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") + xp = Series([datetime(2010, 1, 1), "2011"], dtype=f"datetime64[{unit}]") tm.assert_series_equal(rs, xp) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index f6f3a3b0fb07e..4a8af259b4134 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import lib import pandas as pd @@ -10,6 +12,7 @@ class TestSeriesConvertDtypes: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, maindtype, expected_default, expected_other", [ @@ -231,7 +234,7 @@ def test_convert_dtypes( copy = series.copy(deep=True) if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): result[result.notna()] = np.nan else: result[result.notna()] = np.nan diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 31ef8ff896bcc..2dbd61530dc41 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -75,10 +75,16 @@ class TestSeriesDropDuplicates: params=["int_", "uint", "float64", "str_", "timedelta64[h]", "datetime64[D]"] ) def dtype(self, request): + """ + Fixture that provides different data types for testing. + """ return request.param @pytest.fixture def cat_series_unused_category(self, dtype, ordered): + """ + Fixture that creates a Categorical Series with some unused categories. + """ # Test case 1 cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) @@ -141,7 +147,9 @@ def test_drop_duplicates_categorical_non_bool_keepfalse( @pytest.fixture def cat_series(self, dtype, ordered): - # no unused categories, unlike cat_series_unused_category + """ + Fixture that creates a Categorical Series with no unused categories. 
+ """ cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 875ffdd3fe851..b94723b7cbddf 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -82,15 +82,13 @@ def test_equals_matching_nas(): left = Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.timedelta64("NaT")], dtype=object) right = Series([np.timedelta64("NaT")], dtype=object) assert left.equals(right) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - assert Index(left).equals(Index(right)) + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.float64("NaN")], dtype=object) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 592dba253532d..f53d75df83124 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -6,7 +6,6 @@ import numpy as np import pytest -import pytz from pandas import ( Categorical, @@ -159,9 +158,8 @@ def test_fillna_consistency(self): # assignment ser2 = ser.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): ser2[1] = "foo" - tm.assert_series_equal(ser2, expected) def test_timedelta_fillna(self, frame_or_series, unit): # GH#3371 @@ -411,7 +409,7 @@ def test_datetime64_tz_fillna(self, tz, unit): Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00", tz=tz), - ] + ], ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -861,7 +859,7 @@ def test_fillna_bug(self): def test_ffill_mixed_dtypes_without_missing_data(self): # GH#14956 - series = Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) + series = Series([datetime(2015, 1, 1, tzinfo=timezone.utc), 1]) result = series.ffill() tm.assert_series_equal(series, result) @@ -923,16 +921,16 @@ def test_datetime64tz_fillna_round_issue(self): # GH#14872 data = Series( - [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc)] ) filled = data.bfill() expected = Series( [ - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=timezone.utc), ] ) diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py index f25583904377a..4a11d7905f506 100644 --- a/pandas/tests/series/methods/test_get_numeric_data.py +++ b/pandas/tests/series/methods/test_get_numeric_data.py @@ -1,5 +1,4 @@ from pandas import ( - Index, Series, date_range, ) @@ -19,7 +18,7 @@ def test_get_numeric_data_preserve_dtype(self): obj = Series([1, "2", 3.0]) result = obj._get_numeric_data() - expected = 
diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py
index f25583904377a..4a11d7905f506 100644
--- a/pandas/tests/series/methods/test_get_numeric_data.py
+++ b/pandas/tests/series/methods/test_get_numeric_data.py
@@ -1,5 +1,4 @@
 from pandas import (
-    Index,
     Series,
     date_range,
 )
@@ -19,7 +18,7 @@ def test_get_numeric_data_preserve_dtype(self):

         obj = Series([1, "2", 3.0])
         result = obj._get_numeric_data()
-        expected = Series([], dtype=object, index=Index([], dtype=object))
+        expected = Series([], dtype=object)
         tm.assert_series_equal(result, expected)

         obj = Series([True, False, True])
@@ -28,5 +27,5 @@ def test_get_numeric_data_preserve_dtype(self):

         obj = Series(date_range("20130101", periods=3))
         result = obj._get_numeric_data()
-        expected = Series([], dtype="M8[ns]", index=Index([], dtype=object))
+        expected = Series([], dtype="M8[ns]")
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py
index bd1bc1781958c..097976b0a7ac0 100644
--- a/pandas/tests/series/methods/test_info.py
+++ b/pandas/tests/series/methods/test_info.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest

+from pandas._config import using_string_dtype
+
 from pandas.compat import PYPY

 from pandas import (
@@ -140,6 +142,7 @@ def test_info_memory_usage_deep_pypy():
     assert s_object.memory_usage(deep=True) == s_object.memory_usage()


+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "index, plus",
     [
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index 937b85a547bcd..e997ae32cf2e2 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -92,7 +92,7 @@ def test_isin_with_i8(self):
         tm.assert_series_equal(result, expected)

         # timedelta64[ns]
-        s = Series(pd.to_timedelta(range(5), unit="d"))
+        s = Series(pd.to_timedelta(range(5), unit="D"))
         result = s.isin(s[0:2])
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py
index 56b7cf42a798d..67ba1d7ca51b7 100644
--- a/pandas/tests/series/methods/test_nlargest.py
+++ b/pandas/tests/series/methods/test_nlargest.py
@@ -15,7 +15,7 @@ def assert_check_nselect_boundary(vals, dtype, method):
     # helper function for 'test_boundary_{dtype}' tests
     ser = Series(vals, dtype=dtype)
     result = getattr(ser, method)(3)
-    expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1]
+    expected_idxr = range(3) if method == "nsmallest" else range(3, 0, -1)
     expected = ser.loc[expected_idxr]
     tm.assert_series_equal(result, expected)

@@ -46,7 +46,7 @@ def test_nlargest_error(self, r, method, arg):
         [
             pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]),
             pd.to_datetime(["2003", "2002", "2001", "2002", "2005"], utc=True),
-            pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]),
+            pd.to_timedelta(["3D", "2D", "1D", "2D", "5D"]),
             np.array([3, 2, 1, 2, 5], dtype="int8"),
             np.array([3, 2, 1, 2, 5], dtype="int16"),
             np.array([3, 2, 1, 2, 5], dtype="int32"),
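The test_isin and test_nlargest hunks above standardize the day unit spelling on uppercase `"D"`; both spellings have historically constructed the same timedeltas, the tests simply move to the retained alias. A quick sketch using only the plain `to_timedelta` API (nothing PR-specific assumed):

```python
import pandas as pd

tdi = pd.to_timedelta(range(3), unit="D")  # day unit, uppercase spelling
assert tdi[2] == pd.Timedelta(days=2)
```
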
diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py
index d049f446edb0c..068446a5e216b 100644
--- a/pandas/tests/series/methods/test_reindex.py
+++ b/pandas/tests/series/methods/test_reindex.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_pyarrow_string_dtype
-
 from pandas import (
     NA,
     Categorical,
@@ -22,9 +20,6 @@
 import pandas._testing as tm


-@pytest.mark.xfail(
-    using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow"
-)
 def test_reindex(datetime_series, string_series):
     identity = string_series.reindex(string_series.index)

@@ -234,13 +229,15 @@ def test_reindex_categorical():
     tm.assert_series_equal(result, expected)

     # partial reindexing
-    expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"]))
-    expected.index = [1, 2]
+    expected = Series(
+        Categorical(values=["b", "c"], categories=["a", "b", "c"]), index=range(1, 3)
+    )
     result = s.reindex([1, 2])
     tm.assert_series_equal(result, expected)

-    expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"]))
-    expected.index = [2, 3]
+    expected = Series(
+        Categorical(values=["c", np.nan], categories=["a", "b", "c"]), index=range(2, 4)
+    )
     result = s.reindex([2, 3])
     tm.assert_series_equal(result, expected)

@@ -261,11 +258,11 @@ def test_reindex_fill_value():
     # floats
     floats = Series([1.0, 2.0, 3.0])
     result = floats.reindex([1, 2, 3])
-    expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3])
+    expected = Series([2.0, 3.0, np.nan], index=range(1, 4))
     tm.assert_series_equal(result, expected)

     result = floats.reindex([1, 2, 3], fill_value=0)
-    expected = Series([2.0, 3.0, 0], index=[1, 2, 3])
+    expected = Series([2.0, 3.0, 0], index=range(1, 4))
     tm.assert_series_equal(result, expected)

     # -----------------------------------------------------------
@@ -273,12 +270,12 @@ def test_reindex_fill_value():
     ints = Series([1, 2, 3])

     result = ints.reindex([1, 2, 3])
-    expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3])
+    expected = Series([2.0, 3.0, np.nan], index=range(1, 4))
     tm.assert_series_equal(result, expected)

     # don't upcast
     result = ints.reindex([1, 2, 3], fill_value=0)
-    expected = Series([2, 3, 0], index=[1, 2, 3])
+    expected = Series([2, 3, 0], index=range(1, 4))
     assert issubclass(result.dtype.type, np.integer)
     tm.assert_series_equal(result, expected)

@@ -287,11 +284,11 @@ def test_reindex_fill_value():
     objects = Series([1, 2, 3], dtype=object)

     result = objects.reindex([1, 2, 3])
-    expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object)
+    expected = Series([2, 3, np.nan], index=range(1, 4), dtype=object)
     tm.assert_series_equal(result, expected)

     result = objects.reindex([1, 2, 3], fill_value="foo")
-    expected = Series([2, 3, "foo"], index=[1, 2, 3], dtype=object)
+    expected = Series([2, 3, "foo"], index=range(1, 4), dtype=object)
     tm.assert_series_equal(result, expected)

     # ------------------------------------------------------------
@@ -299,11 +296,11 @@ def test_reindex_fill_value():
     bools = Series([True, False, True])

     result = bools.reindex([1, 2, 3])
-    expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object)
+    expected = Series([False, True, np.nan], index=range(1, 4), dtype=object)
     tm.assert_series_equal(result, expected)

     result = bools.reindex([1, 2, 3], fill_value=False)
-    expected = Series([False, True, False], index=[1, 2, 3])
+    expected = Series([False, True, False], index=range(1, 4))
     tm.assert_series_equal(result, expected)

@@ -318,7 +315,7 @@ def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value):
     ser = Series([NaT], dtype=dtype)

     result = ser.reindex([0, 1], fill_value=fill_value)
-    expected = Series([NaT, fill_value], index=[0, 1], dtype=object)
+    expected = Series([NaT, fill_value], index=range(2), dtype=object)
     tm.assert_series_equal(result, expected)
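The reindex expectations above switch from list indexes to `range(...)`, which `tm.assert_series_equal` treats as equivalent by default (`check_index_type="equiv"`); the substance of the hunks is the standard fill-value upcasting rule, sketched here:

```python
import numpy as np
import pandas as pd

ints = pd.Series([1, 2, 3])
assert ints.reindex(range(1, 4)).dtype == np.float64              # NaN forces upcast
assert ints.reindex(range(1, 4), fill_value=0).dtype == np.int64  # explicit fill keeps int
```
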
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index 0a79bcea679a7..de0855bf7192e 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest

-from pandas._config import using_pyarrow_string_dtype
+from pandas._config import using_string_dtype

 import pandas as pd
 import pandas._testing as tm
@@ -359,7 +359,6 @@ def test_replace_mixed_types_with_string(self):
         expected = pd.Series([1, np.nan, 3, np.nan, 4, 5], dtype=object)
         tm.assert_series_equal(expected, result)

-    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string")
     @pytest.mark.parametrize(
         "categorical, numeric",
         [
@@ -370,9 +369,7 @@ def test_replace_mixed_types_with_string(self):
     def test_replace_categorical(self, categorical, numeric):
         # GH 24971, GH#23305
         ser = pd.Series(pd.Categorical(categorical, categories=["A", "B"]))
-        msg = "with CategoricalDtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = ser.replace({"A": 1, "B": 2})
+        result = ser.cat.rename_categories({"A": 1, "B": 2})
         expected = pd.Series(numeric).astype("category")
         if 2 not in expected.cat.categories:
             # i.e. categories should be [1, 2] even if there are no "B"s present
@@ -380,16 +377,13 @@ def test_replace_categorical(self, categorical, numeric):
             expected = expected.cat.add_categories(2)
         tm.assert_series_equal(expected, result, check_categorical=False)

-    @pytest.mark.parametrize(
-        "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])]
-    )
-    def test_replace_categorical_inplace(self, data, data_exp):
+    def test_replace_categorical_inplace(self):
         # GH 53358
+        data = ["a", "b", "c"]
+        data_exp = ["b", "b", "c"]
         result = pd.Series(data, dtype="category")
-        msg = "with CategoricalDtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result.replace(to_replace="a", value="b", inplace=True)
-        expected = pd.Series(data_exp, dtype="category")
+        result.replace(to_replace="a", value="b", inplace=True)
+        expected = pd.Series(pd.Categorical(data_exp, categories=data))
         tm.assert_series_equal(result, expected)

     def test_replace_categorical_single(self):
@@ -404,25 +398,10 @@ def test_replace_categorical_single(self):
         expected = expected.cat.remove_unused_categories()
         assert c[2] != "foo"

-        msg = "with CategoricalDtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = c.replace(c[2], "foo")
+        result = c.cat.rename_categories({c.values[2]: "foo"})
         tm.assert_series_equal(expected, result)
         assert c[2] != "foo"  # ensure non-inplace call does not alter original

-        msg = "with CategoricalDtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            return_value = c.replace(c[2], "foo", inplace=True)
-        assert return_value is None
-        tm.assert_series_equal(expected, c)
-
-        first_value = c[0]
-        msg = "with CategoricalDtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            return_value = c.replace(c[1], c[0], inplace=True)
-        assert return_value is None
-        assert c[0] == c[1] == first_value  # test replacing with existing value
-
     def test_replace_with_no_overflowerror(self):
         # GH 25616
         # casts to object without Exception from OverflowError
@@ -640,7 +619,7 @@ def test_replace_nullable_numeric(self):
         with pytest.raises(TypeError, match="Invalid value"):
             ints.replace(1, 9.5)

-    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string")
+    @pytest.mark.xfail(using_string_dtype(), reason="can't fill 1 in string")
     @pytest.mark.parametrize("regex", [False, True])
     def test_replace_regex_dtype_series(self, regex):
         # GH-48644
@@ -668,6 +647,7 @@ def test_replace_value_none_dtype_numeric(self, val):
         expected = pd.Series([1, None], dtype=object)
         tm.assert_series_equal(result, expected)

+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_replace_change_dtype_series(self, using_infer_string):
         # GH#25797
         df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]})
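The categorical hunks above migrate from `Series.replace` (deprecated for CategoricalDtype) to the `.cat.rename_categories` accessor. A minimal sketch of the replacement call:

```python
import pandas as pd

ser = pd.Series(pd.Categorical(["A", "B", "A"], categories=["A", "B"]))
renamed = ser.cat.rename_categories({"A": 1, "B": 2})  # remaps category labels
assert list(renamed) == [1, 2, 1]
assert renamed.dtype == "category"
```
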
diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py
index f7dec02ab0e5b..0bcad49847291 100644
--- a/pandas/tests/series/methods/test_to_csv.py
+++ b/pandas/tests/series/methods/test_to_csv.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest

+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import Series
 import pandas._testing as tm
@@ -24,6 +26,7 @@ def read_csv(self, path, **kwargs):

         return out

+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_from_csv(self, datetime_series, string_series, temp_file):
         # freq doesn't round-trip
         datetime_series.index = datetime_series.index._with_freq(None)
@@ -31,7 +34,9 @@ def test_from_csv(self, datetime_series, string_series, temp_file):
         path = temp_file
         datetime_series.to_csv(path, header=False)
         ts = self.read_csv(path, parse_dates=True)
-        tm.assert_series_equal(datetime_series, ts, check_names=False)
+        expected = datetime_series.copy()
+        expected.index = expected.index.as_unit("s")
+        tm.assert_series_equal(expected, ts, check_names=False)

         assert ts.name is None
         assert ts.index.name is None
@@ -57,6 +62,7 @@ def test_from_csv(self, datetime_series, string_series, temp_file):

         series = self.read_csv(path, sep="|", parse_dates=True)
         check_series = Series({datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0})
+        check_series.index = check_series.index.as_unit("s")
         tm.assert_series_equal(check_series, series)

         series = self.read_csv(path, sep="|", parse_dates=False)
diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py
index f9e6dc644e908..8c4f0ff3eaea7 100644
--- a/pandas/tests/series/methods/test_unstack.py
+++ b/pandas/tests/series/methods/test_unstack.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest

+from pandas._config import using_string_dtype
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -134,6 +136,7 @@ def test_unstack_mixed_type_name_in_multiindex(
     tm.assert_frame_equal(result, expected)


+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_unstack_multi_index_categorical_values():
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
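The to_csv hunks now compare against a second-resolution datetime index, hence the `as_unit("s")` conversions above. The conversion changes only the stored resolution, not the timestamps; a sketch:

```python
import pandas as pd

idx = pd.date_range("2000-01-01", periods=2)  # datetime64[ns]
sec = idx.as_unit("s")
assert sec.dtype == "datetime64[s]"
assert (sec == idx).all()                     # same instants, coarser unit
```
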
diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py
index 1d29e116be5c2..9b5fb098bf3ee 100644
--- a/pandas/tests/series/methods/test_update.py
+++ b/pandas/tests/series/methods/test_update.py
@@ -35,37 +35,39 @@ def test_update(self):
         tm.assert_frame_equal(df, expected)

     @pytest.mark.parametrize(
-        "other, dtype, expected, warn",
+        "other, dtype, expected, raises",
         [
             # other is int
-            ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), None),
-            ([61, 63], "int64", Series([10, 61, 12]), None),
-            ([61, 63], float, Series([10.0, 61.0, 12.0]), None),
-            ([61, 63], object, Series([10, 61, 12], dtype=object), None),
+            ([61, 63], "int32", Series([10, 61, 12], dtype="int32"), False),
+            ([61, 63], "int64", Series([10, 61, 12]), False),
+            ([61, 63], float, Series([10.0, 61.0, 12.0]), False),
+            ([61, 63], object, Series([10, 61, 12], dtype=object), False),
             # other is float, but can be cast to int
-            ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), None),
-            ([61.0, 63.0], "int64", Series([10, 61, 12]), None),
-            ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), None),
-            ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), None),
+            ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32"), False),
+            ([61.0, 63.0], "int64", Series([10, 61, 12]), False),
+            ([61.0, 63.0], float, Series([10.0, 61.0, 12.0]), False),
+            ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object), False),
             # others is float, cannot be cast to int
-            ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0]), FutureWarning),
-            ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), FutureWarning),
-            ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), None),
-            ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), None),
+            ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0]), True),
+            ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0]), True),
+            ([61.1, 63.1], float, Series([10.0, 61.1, 12.0]), False),
+            ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object), False),
             # other is object, cannot be cast
-            ([(61,), (63,)], "int32", Series([10, (61,), 12]), FutureWarning),
-            ([(61,), (63,)], "int64", Series([10, (61,), 12]), FutureWarning),
-            ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), FutureWarning),
-            ([(61,), (63,)], object, Series([10, (61,), 12]), None),
+            ([(61,), (63,)], "int32", Series([10, (61,), 12]), True),
+            ([(61,), (63,)], "int64", Series([10, (61,), 12]), True),
+            ([(61,), (63,)], float, Series([10.0, (61,), 12.0]), True),
+            ([(61,), (63,)], object, Series([10, (61,), 12]), False),
         ],
     )
-    def test_update_dtypes(self, other, dtype, expected, warn):
+    def test_update_dtypes(self, other, dtype, expected, raises):
         ser = Series([10, 11, 12], dtype=dtype)
         other = Series(other, index=[1, 3])
-        with tm.assert_produces_warning(warn, match="item of incompatible dtype"):
+        if raises:
+            with pytest.raises(TypeError, match="Invalid value"):
+                ser.update(other)
+        else:
             ser.update(other)
-
-        tm.assert_series_equal(ser, expected)
+            tm.assert_series_equal(ser, expected)

     @pytest.mark.parametrize(
         "values, other, expected",
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index f0930a831e98d..ff84b5c52183b 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest

+from pandas._config import using_string_dtype
+
 from pandas._libs import lib
 from pandas._libs.tslibs import IncompatibleFrequency

@@ -500,6 +502,7 @@ def test_ser_cmp_result_names(self, names, comparison_op):
         result = op(ser, cidx)
         assert result.name == names[2]

+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_comparisons(self, using_infer_string):
         s = Series(["a", "b", "c"])
         s2 = Series([False, True, False])
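test_update_dtypes above now branches on a boolean `raises` instead of a warning class: updates whose values cannot be cast to the caller's dtype raise. A sketch, assuming the same "Invalid value" TypeError the parametrization matches:

```python
import pandas as pd
import pytest

ser = pd.Series([10, 11, 12], dtype="int64")
with pytest.raises(TypeError, match="Invalid value"):
    ser.update(pd.Series([61.1, 63.1], index=[1, 3]))  # 61.1 cannot be cast to int64
```
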
"M8[us]" ser.iloc[0] = np.nan - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") @@ -1084,16 +1084,16 @@ def test_constructor_dtype_datetime64_4(self): def test_constructor_dtype_datetime64_3(self): # if we passed a NaT it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) - assert ser.dtype == "object" + assert ser.dtype == "M8[us]" assert ser[2] is NaT assert "NaT" in str(ser) def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert ser.dtype == "object" - assert ser[2] is np.nan - assert "NaN" in str(ser) + assert ser.dtype == "M8[us]" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_with_datetime_tz(self): # 8260 @@ -1155,7 +1155,7 @@ def test_constructor_with_datetime_tz4(self): Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ] ) - assert ser.dtype == "datetime64[ns, US/Pacific]" + assert ser.dtype == "datetime64[s, US/Pacific]" assert lib.infer_dtype(ser, skipna=True) == "datetime64" def test_constructor_with_datetime_tz3(self): @@ -1215,7 +1215,7 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, kind, unit): def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype="datetime64[ns, CET]") - expected = Series(Timestamp(arg)).dt.tz_localize("CET") + expected = Series([Timestamp(arg)], dtype="M8[ns]").dt.tz_localize("CET") tm.assert_series_equal(result, expected) def test_constructor_datetime64_bigendian(self): @@ -1318,9 +1318,8 @@ def test_constructor_periodindex(self): pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - expected = Series(pi.astype(object)) - tm.assert_series_equal(s, expected) + expected = Series(pi.astype(object)) + assert expected.dtype == object def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} @@ -1356,14 +1355,8 @@ def test_constructor_dict_order(self): expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) - def test_constructor_dict_extension(self, ea_scalar_and_dtype, request): + def test_constructor_dict_extension(self, ea_scalar_and_dtype): ea_scalar, ea_dtype = ea_scalar_and_dtype - if isinstance(ea_scalar, Timestamp): - mark = pytest.mark.xfail( - reason="Construction from dict goes through " - "maybe_convert_objects which casts to nano" - ) - request.applymarker(mark) d = {"a": ea_scalar} result = Series(d, index=["a"]) expected = Series(ea_scalar, index=["a"], dtype=ea_dtype) @@ -1408,7 +1401,9 @@ def create_data(constructor): result_Timestamp = Series(data_Timestamp) tm.assert_series_equal(result_datetime64, expected) - tm.assert_series_equal(result_datetime, expected) + tm.assert_series_equal( + result_datetime, expected.set_axis(expected.index.as_unit("us")) + ) tm.assert_series_equal(result_Timestamp, expected) def test_constructor_dict_tuple_indexer(self): @@ -2118,9 +2113,12 @@ def test_series_string_inference_array_string_dtype(self): tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): - # GH#54793 
+ # https://github.com/pandas-dev/pandas/issues/54793 + # but after PDEP-14 (string dtype), it was decided to keep dtype="string" + # returning the NA string dtype, so expected is changed from + # "string[pyarrow_numpy]" to "string[pyarrow]" pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + expected = Series(["a", "b"], dtype="string[python]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) @@ -2141,20 +2139,14 @@ def test_series_string_inference_na_first(self): result = Series([pd.NA, "b"]) tm.assert_series_equal(result, expected) - def test_inference_on_pandas_objects(self): + @pytest.mark.parametrize("klass", [Series, Index]) + def test_inference_on_pandas_objects(self, klass): # GH#56012 - ser = Series([Timestamp("2019-12-31")], dtype=object) - with tm.assert_produces_warning(None): - # This doesn't do inference - result = Series(ser) + obj = klass([Timestamp("2019-12-31")], dtype=object) + # This doesn't do inference + result = Series(obj) assert result.dtype == np.object_ - idx = Index([Timestamp("2019-12-31")], dtype=object) - - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - result = Series(idx) - assert result.dtype != np.object_ - class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index c001e0f9b028a..1d95fbf8dccb8 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype +from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -144,7 +144,7 @@ def test_tidy_repr_name_0(self, arg): assert "Name: 0" in rep_str @pytest.mark.xfail( - using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" + using_string_dtype(), reason="TODO(infer_string): investigate failure" ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index f59eacea3fe6c..939bf888fd61b 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas import ( DataFrame, Index, @@ -348,6 +350,7 @@ def test_reverse_ops_with_index(self, op, expected): expected = Series(expected) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 108c3aabb1aa4..1c88329a83b0e 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -37,13 +37,8 @@ def test_timedelta64_nan(self): assert not isna(td1[0]) # GH#16674 iNaT is treated as an integer when given by the user - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with pytest.raises(TypeError, match="Invalid value"): td1[1] = iNaT - assert not isna(td1[1]) - assert td1.dtype == np.object_ - assert td1[1] == iNaT - td1[1] = td[1] - assert not isna(td1[1]) td1[2] = NaT assert isna(td1[2]) diff --git 
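The constructor hunks above replace hard-coded nanosecond expectations with inferred resolutions: Python `datetime` objects infer microseconds, while NaT-only input infers seconds. A sketch of the expectations these tests pin (assuming the non-nanosecond inference being tested here):

```python
from datetime import datetime
import pandas as pd

assert pd.Series([datetime(2010, 1, 1)]).dtype == "M8[us]"  # pydatetime -> microseconds
assert pd.Series([pd.NaT, pd.NaT]).dtype == "M8[s]"         # NaT-only -> seconds
```
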
diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py
index 01b49b5e5b633..6c4bec6a23789 100644
--- a/pandas/tests/strings/__init__.py
+++ b/pandas/tests/strings/__init__.py
@@ -2,12 +2,20 @@

 import pandas as pd

-object_pyarrow_numpy = ("object", "string[pyarrow_numpy]")
+
+def is_object_or_nan_string_dtype(dtype):
+    """
+    Check if string-like dtype is following NaN semantics, i.e. is object
+    dtype or a NaN-variant of the StringDtype.
+    """
+    return (isinstance(dtype, np.dtype) and dtype == "object") or (
+        dtype.na_value is np.nan
+    )


 def _convert_na_value(ser, expected):
     if ser.dtype != object:
-        if ser.dtype.storage == "pyarrow_numpy":
+        if ser.dtype.na_value is np.nan:
             expected = expected.fillna(np.nan)
         else:
             # GH#18463
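Usage sketch for the new `is_object_or_nan_string_dtype` helper above: it returns True for object dtype and for NaN-semantics string dtypes, and False for the NA-semantics `StringDtype` variants (the import assumes the pandas test package is importable):

```python
import numpy as np
import pandas as pd
from pandas.tests.strings import is_object_or_nan_string_dtype

assert is_object_or_nan_string_dtype(np.dtype(object))
assert not is_object_or_nan_string_dtype(pd.StringDtype("python"))  # na_value is pd.NA
```
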
"boolean" + ) expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -230,7 +242,9 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -675,7 +689,9 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") @@ -730,12 +746,16 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -743,7 +763,9 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -759,7 +781,9 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -768,7 +792,9 @@ def test_fullmatch_dollar_literal(any_string_dtype): # GH 56652 ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) result = ser.str.fullmatch("foo\\$") - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -778,14 +804,18 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype in 
object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype, performance_warning): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series([True, False, False, False], dtype=expected_dtype) @@ -859,7 +889,9 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -911,7 +943,9 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 452e5ec5cf939..4fab6e7778002 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -14,7 +14,7 @@ ) from pandas.tests.strings import ( _convert_na_value, - object_pyarrow_numpy, + is_object_or_nan_string_dtype, ) @@ -385,7 +385,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 25e4e1f9ec50c..1ce46497c3c22 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,7 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import is_object_or_nan_string_dtype @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -41,7 +41,9 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -93,7 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype in object_pyarrow_numpy: + if is_object_or_nan_string_dtype(any_string_dtype): empty_int = 
Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -119,16 +121,16 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.repeat(3)) tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( - DataFrame(columns=[0], dtype=any_string_dtype), + DataFrame(columns=range(1), dtype=any_string_dtype), empty.str.extract("()", expand=True), ) tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=any_string_dtype), + DataFrame(columns=range(2), dtype=any_string_dtype), empty.str.extract("()()", expand=True), ) tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=any_string_dtype), + DataFrame(columns=range(2), dtype=any_string_dtype), empty.str.extract("()()", expand=False), ) tm.assert_frame_equal(empty_df.set_axis([], axis=1), empty.str.get_dummies()) @@ -207,7 +209,9 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -233,7 +237,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): ["A", "3", "¼", "★", "፸", "3", "four"], # noqa: RUF001 dtype=any_string_dtype, ) - expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -253,7 +259,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -284,7 +292,9 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + "float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -313,7 +323,9 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -354,7 +366,9 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], 
dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" + expected_dtype = ( + np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64" + ) result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6da6ad27f853f..06fd81ed722d9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs import ( algos as libalgos, hashtable as ht, @@ -873,6 +875,14 @@ def test_unique_masked(self, any_numeric_ea_dtype): expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype) tm.assert_extension_array_equal(result, expected) + def test_unique_NumpyExtensionArray(self): + arr_complex = pd.array( + [1 + 1j, 2, 3] + ) # NumpyEADtype('complex128') => NumpyExtensionArray + result = pd.unique(arr_complex) + expected = pd.array([1 + 1j, 2 + 0j, 3 + 0j]) + tm.assert_extension_array_equal(result, expected) + def test_nunique_ints(index_or_series_or_array): # GH#36327 @@ -1264,6 +1274,7 @@ def test_value_counts_datetime_outofbounds(self, dtype): ], dtype=dtype, ) + res = ser.value_counts() exp_index = Index( @@ -1637,7 +1648,10 @@ def test_unique_tuples(self, arr, uniques): expected = np.empty(len(uniques), dtype=object) expected[:] = uniques - msg = "unique requires a Series, Index, ExtensionArray, or np.ndarray, got list" + msg = ( + r"unique requires a Series, Index, ExtensionArray, np.ndarray " + r"or NumpyExtensionArray got list" + ) with pytest.raises(TypeError, match=msg): # GH#52986 pd.unique(arr) @@ -1656,7 +1670,11 @@ def test_unique_tuples(self, arr, uniques): ) def test_unique_complex_numbers(self, array, expected): # GH 17927 - msg = "unique requires a Series, Index, ExtensionArray, or np.ndarray, got list" + msg = ( + r"unique requires a Series, Index, ExtensionArray, np.ndarray " + r"or NumpyExtensionArray got list" + ) + with pytest.raises(TypeError, match=msg): # GH#52986 pd.unique(array) @@ -1666,6 +1684,7 @@ def test_unique_complex_numbers(self, array, expected): class TestHashTable: + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ @@ -1705,6 +1724,7 @@ def test_hashtable_unique(self, htable, data, writable): reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index bcecd1b2d5eec..ca97af0d3eb32 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -3,11 +3,12 @@ import string import subprocess import sys -import textwrap import numpy as np import pytest +from pandas.compat import WASM + import pandas as pd from pandas import Series import pandas._testing as tm @@ -234,6 +235,7 @@ def test_temp_setattr(with_exception): assert ser.name == "first" +@pytest.mark.skipif(WASM, reason="Can't start subprocesses in WASM") @pytest.mark.single_cpu def test_str_size(): # GH#21758 @@ -247,21 +249,3 @@ def test_str_size(): ] result = subprocess.check_output(call).decode()[-4:-1].strip("\n") assert int(result) == int(expected) - - -@pytest.mark.single_cpu -def test_bz2_missing_import(): - # Check whether bz2 missing import is handled correctly (issue 
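The test_algos hunks extend `pd.unique` to `NumpyExtensionArray` inputs and widen the rejection message accordingly. A sketch of the newly accepted input the added test covers:

```python
import pandas as pd

arr = pd.array([1 + 1j, 2, 3, 2])  # NumpyEADtype('complex128') -> NumpyExtensionArray
assert len(pd.unique(arr)) == 3    # duplicates collapsed, EA input accepted
```
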
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index bcecd1b2d5eec..ca97af0d3eb32 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -3,11 +3,12 @@
 import string
 import subprocess
 import sys
-import textwrap

 import numpy as np
 import pytest

+from pandas.compat import WASM
+
 import pandas as pd
 from pandas import Series
 import pandas._testing as tm
@@ -234,6 +235,7 @@ def test_temp_setattr(with_exception):
     assert ser.name == "first"


+@pytest.mark.skipif(WASM, reason="Can't start subprocesses in WASM")
 @pytest.mark.single_cpu
 def test_str_size():
     # GH#21758
@@ -247,21 +249,3 @@ def test_str_size():
     ]
     result = subprocess.check_output(call).decode()[-4:-1].strip("\n")
     assert int(result) == int(expected)
-
-
-@pytest.mark.single_cpu
-def test_bz2_missing_import():
-    # Check whether bz2 missing import is handled correctly (issue #53857)
-    code = """
-        import sys
-        sys.modules['bz2'] = None
-        import pytest
-        import pandas as pd
-        from pandas.compat import get_bz2_file
-        msg = 'bz2 module not available.'
-        with pytest.raises(RuntimeError, match=msg):
-            get_bz2_file()
-    """
-    code = textwrap.dedent(code)
-    call = [sys.executable, "-c", code]
-    subprocess.check_output(call)
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index ee26fdae74960..1e6538ca5a8fb 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -10,6 +10,8 @@
 import numpy as np
 import pytest

+from pandas._config import using_string_dtype
+
 from pandas.errors import IntCastingNaNError

 import pandas as pd
@@ -164,6 +166,7 @@ def test_pandas_datareader():
     pytest.importorskip("pandas_datareader")


+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
 def test_pyarrow(df):
     pyarrow = pytest.importorskip("pyarrow")
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 8f661edf0f241..e87498742061b 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -288,6 +288,13 @@ def test_multiindex_with_na(self):

         tm.assert_frame_equal(df, expected)

+    @pytest.mark.parametrize("na", [None, np.nan])
+    def test_multiindex_insert_level_with_na(self, na):
+        # GH 59003
+        df = DataFrame([0], columns=[["A"], ["B"]])
+        df[na, "B"] = 1
+        tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"]))
+

 class TestSorted:
     """everything you wanted to test about sorting"""
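The new test_multiindex_insert_level_with_na above pins GH 59003: inserting a column whose first level is NA and selecting it back. Sketched directly from the test:

```python
import pandas as pd

df = pd.DataFrame([0], columns=[["A"], ["B"]])
df[None, "B"] = 1  # NA first level is allowed
assert df[None]["B"].iloc[0] == 1
```
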
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index 132608d7df115..2a225bda953cf 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -104,7 +104,9 @@ def test_int64_overflow_groupby_large_df_shuffled(self, agg):
         gr = df.groupby(list("abcde"))

         # verify this is testing what it is supposed to test!
-        assert is_int64_overflow_possible(gr._grouper.shape)
+        assert is_int64_overflow_possible(
+            tuple(ping.ngroups for ping in gr._grouper.groupings)
+        )

         mi = MultiIndex.from_arrays(
             [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)],
@@ -221,7 +223,6 @@ def test_int64_overflow_how_merge(self, left_right, join_type):

         out = merge(left, right, how="outer")
         out.sort_values(out.columns.tolist(), inplace=True)
-        out.index = np.arange(len(out))
         tm.assert_frame_equal(out, merge(left, right, how=join_type, sort=True))

     @pytest.mark.slow
diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py
index ce2e4e0f6cec5..451ef42fff3d1 100644
--- a/pandas/tests/test_take.py
+++ b/pandas/tests/test_take.py
@@ -5,6 +5,7 @@

 from pandas._libs import iNaT

+from pandas import array
 import pandas._testing as tm
 import pandas.core.algorithms as algos

@@ -303,7 +304,14 @@ def test_take_coerces_list(self):
         arr = [1, 2, 3]
         msg = (
             "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, "
-            "Index, or Series, got list"
+            "Index, Series, or NumpyExtensionArray got list"
         )
         with pytest.raises(TypeError, match=msg):
             algos.take(arr, [0, 0])
+
+    def test_take_NumpyExtensionArray(self):
+        # GH#59177
+        arr = array([1 + 1j, 2, 3])  # NumpyEADtype('complex128') (NumpyExtensionArray)
+        assert algos.take(arr, [2]) == 3
+
+        arr = array([1, 2, 3])  # Int64Dtype() (ExtensionArray)
+        assert algos.take(arr, [2]) == 3
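test_take_NumpyExtensionArray above exercises `take` with extension-array input; position 2 of `[1+1j, 2, 3]` holds `3` (the original `== 2` comparison contradicted that, fixed above). The same call through the public API:

```python
import pandas as pd
from pandas.api.extensions import take

arr = pd.array([1 + 1j, 2, 3])  # NumpyExtensionArray
assert take(arr, [2])[0] == 3   # element at position 2
```
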
Series(["20121231", "20141231", "NaT"], dtype="M8[s]") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -417,9 +432,11 @@ def test_to_datetime_format_weeks(self, value, fmt, expected, cache): ["2010-01-01 12:00:00 Z", "2010-01-01 12:00:00 Z"], [ Timestamp( - "2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0) - ), # pytz coerces to UTC - Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0)), + "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0)) + ), + Timestamp( + "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0)) + ), ], ], ], @@ -532,7 +549,8 @@ def test_to_datetime_overflow(self): res = to_datetime(arg, errors="coerce") assert res is NaT res = to_datetime([arg], errors="coerce") - tm.assert_index_equal(res, Index([NaT])) + exp = Index([NaT], dtype="M8[s]") + tm.assert_index_equal(res, exp) def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior @@ -563,7 +581,7 @@ def test_to_datetime_mixed_date_and_string(self, format): # https://github.com/pandas-dev/pandas/issues/50108 d1 = date(2020, 1, 2) res = to_datetime(["2020-01-01", d1], format=format) - expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[ns]") + expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[s]") tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -579,7 +597,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, with utc", ), @@ -588,7 +606,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - ), + ).as_unit("us"), id="all tz-aware, without utc", ), pytest.param( @@ -596,7 +614,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, mixed offsets, with utc", ), @@ -605,7 +623,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="tz-aware string, naive pydatetime, with utc", ), @@ -625,6 +643,8 @@ def test_to_datetime_mixed_datetime_and_string_with_format( ts1 = constructor(args[0]) ts2 = args[1] result = to_datetime([ts1, ts2], format=fmt, utc=utc) + if constructor is Timestamp: + expected = expected.as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -696,7 +716,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%m-%d %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="ISO8601, UTC", ), @@ -704,7 +724,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%d-%m %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="non-ISO8601, UTC", ), @@ -965,7 +985,7 @@ def test_to_datetime_now(self): # See GH#18666 with 
tm.set_timezone("US/Eastern"): # GH#18705 - now = Timestamp("now").as_unit("ns") + now = Timestamp("now") pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] @@ -988,12 +1008,12 @@ def test_to_datetime_today(self, tz): # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. with tm.set_timezone(tz): - nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + nptoday = np.datetime64("today").astype("datetime64[us]").astype(np.int64) pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] - tstoday = Timestamp("today").as_unit("ns") - tstoday2 = Timestamp.today().as_unit("ns") + tstoday = Timestamp("today") + tstoday2 = Timestamp.today() # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -1030,7 +1050,7 @@ def test_to_datetime_now_with_format(self, format, expected_ds, string, attribut # https://github.com/pandas-dev/pandas/issues/50359 result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True) expected = DatetimeIndex( - [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" + [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[s, UTC]" ) assert (expected - result).max().total_seconds() < 1 @@ -1091,11 +1111,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing result = to_datetime(dts, cache=cache) - if cache: - # FIXME: behavior should not depend on cache - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") - else: - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]") + expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") tm.assert_index_equal(result, expected) @@ -1106,14 +1122,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): to_datetime(dts_with_oob, errors="raise") result = to_datetime(dts_with_oob, errors="coerce", cache=cache) - if not cache: - # FIXME: shouldn't depend on cache! 
- expected = DatetimeIndex( - [Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30 - + [NaT], - ) - else: - expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) + expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) tm.assert_index_equal(result, expected) def test_to_datetime_tz(self, cache): @@ -1126,7 +1135,7 @@ def test_to_datetime_tz(self, cache): result = to_datetime(arr, cache=cache) expected = DatetimeIndex( ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_to_datetime_tz_mixed(self, cache): @@ -1145,7 +1154,7 @@ def test_to_datetime_tz_mixed(self, cache): result = to_datetime(arr, cache=cache, errors="coerce") expected = DatetimeIndex( - ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[ns, US/Pacific]" + ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[s, US/Pacific]" ) tm.assert_index_equal(result, expected) @@ -1162,6 +1171,7 @@ def test_to_datetime_different_offsets_removed(self, cache): def test_to_datetime_tz_pytz(self, cache): # see gh-8260 + pytz = pytest.importorskip("pytz") us_eastern = pytz.timezone("US/Eastern") arr = np.array( [ @@ -1177,7 +1187,7 @@ def test_to_datetime_tz_pytz(self, cache): result = to_datetime(arr, utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1264,7 +1274,7 @@ def test_to_datetime_tz_psycopg2(self, request, cache): result = to_datetime(arr, errors="coerce", utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1273,15 +1283,15 @@ def test_to_datetime_tz_psycopg2(self, request, cache): i = DatetimeIndex( ["2000-01-01 08:00:00"], tz=psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None), - ) - assert is_datetime64_ns_dtype(i) + ).as_unit("us") + assert not is_datetime64_ns_dtype(i) # tz coercion result = to_datetime(i, errors="coerce", cache=cache) tm.assert_index_equal(result, i) result = to_datetime(i, errors="coerce", utc=True, cache=cache) - expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]") + expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[us, UTC]") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) @@ -1351,16 +1361,20 @@ def test_datetime_invalid_scalar(self, value, format): def test_datetime_outofbounds_scalar(self, value, format): # GH24763 res = to_datetime(value, errors="coerce", format=format) - assert res is NaT + if format is None: + assert isinstance(res, Timestamp) + assert res == Timestamp(value) + else: + assert res is NaT if format is not None: msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' 
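The to_datetime hunks above consistently swap nanosecond expectations for inferred resolutions: second resolution for date-like strings, microseconds for pytz/pydatetime inputs. A sketch of the string case, assuming that inference:

```python
import pandas as pd

idx = pd.to_datetime(["2013-01-01 13:00:00", "2013-01-02 14:00:00"])
assert idx.dtype == "M8[s]"  # strings infer seconds under the behavior tested here
```
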
with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: - msg = "^Out of bounds .*, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(value, errors="raise", format=format) + res = to_datetime(value, errors="raise", format=format) + assert isinstance(res, Timestamp) + assert res == Timestamp(value) @pytest.mark.parametrize( ("values"), [(["a"]), (["00:01:99"]), (["a", "b", "99:00:00"])] @@ -1433,15 +1447,17 @@ def test_to_datetime_cache_scalar(self): assert result == expected @pytest.mark.parametrize( - "datetimelikes,expected_values", + "datetimelikes,expected_values,exp_unit", ( ( (None, np.nan) + (NaT,) * start_caching_at, (NaT,) * (start_caching_at + 2), + "s", ), ( (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + "s", ), ( (None,) @@ -1449,11 +1465,12 @@ def test_to_datetime_cache_scalar(self): + ("2012 July 26", Timestamp("2012-07-26")), (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + "s", ), ), ) def test_convert_object_to_datetime_with_cache( - self, datetimelikes, expected_values + self, datetimelikes, expected_values, exp_unit ): # GH#39882 ser = Series( @@ -1463,7 +1480,7 @@ def test_convert_object_to_datetime_with_cache( result_series = to_datetime(ser, errors="coerce") expected_series = Series( expected_values, - dtype="datetime64[ns]", + dtype=f"datetime64[{exp_unit}]", ) tm.assert_series_equal(result_series, expected_series) @@ -1484,7 +1501,7 @@ def test_convert_object_to_datetime_with_cache( ) def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 - expected = Series([NaT] * len(input), dtype="M8[ns]") + expected = Series([NaT] * len(input), dtype="M8[s]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1535,7 +1552,17 @@ def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): # https://github.com/pandas-dev/pandas/issues/50255 ts_strings = [string_arg, outofbounds] result = to_datetime(ts_strings, errors="coerce", format=format) - expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) + if isinstance(outofbounds, str) and ( + format.startswith("%B") ^ outofbounds.startswith("J") + ): + # the strings don't match the given format, so they raise and we coerce + expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[s]") + elif isinstance(outofbounds, datetime): + expected = DatetimeIndex( + [datetime(2018, 3, 1), outofbounds], dtype="M8[us]" + ) + else: + expected = DatetimeIndex([datetime(2018, 3, 1), outofbounds], dtype="M8[s]") tm.assert_index_equal(result, expected) def test_to_datetime_malformed_no_raise(self): @@ -1546,7 +1573,9 @@ def test_to_datetime_malformed_no_raise(self): UserWarning, match="Could not infer format", raise_on_extra_warnings=False ): result = to_datetime(ts_strings, errors="coerce") - tm.assert_index_equal(result, Index([NaT, NaT])) + # TODO: should Index get "s" by default here? 
+ exp = Index([NaT, NaT], dtype="M8[s]") + tm.assert_index_equal(result, exp) def test_to_datetime_malformed_raise(self): # GH 48633 @@ -1594,7 +1623,7 @@ def test_iso_8601_strings_with_different_offsets_utc(self): result = to_datetime(ts_strings, utc=True) expected = DatetimeIndex( [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_mixed_offsets_with_native_datetime_utc_false_raises(self): @@ -1620,7 +1649,7 @@ def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) expected = DatetimeIndex( [datetime(2018, 3, 1, 12, tzinfo=timezone(timedelta(minutes=240)))] * 2 - ) + ).as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1641,9 +1670,11 @@ def test_timestamp_utc_true(self, ts, expected): @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"]) def test_to_datetime_with_format_out_of_bounds(self, dt_str): # GH 9107 - msg = "Out of bounds nanosecond timestamp" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(dt_str, format="%Y%m%d") + res = to_datetime(dt_str, format="%Y%m%d") + dtobj = datetime.strptime(dt_str, "%Y%m%d") + expected = Timestamp(dtobj).as_unit("s") + assert res == expected + assert res.unit == expected.unit def test_to_datetime_utc(self): arr = np.array([parse("2012-06-13T01:39:00Z")], dtype=object) @@ -1671,7 +1702,9 @@ def test_to_datetime_fixed_offset(self): ["2020-10-26 00:00:00+06:00", Timestamp("2018-01-01", tz="US/Pacific")], [ "2020-10-26 00:00:00+06:00", - datetime(2020, 1, 1, 18, tzinfo=pytz.timezone("Australia/Melbourne")), + datetime(2020, 1, 1, 18).astimezone( + zoneinfo.ZoneInfo("Australia/Melbourne") + ), ], ], ) @@ -1726,7 +1759,7 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # In 3.0, the string "1.5" is parsed as it would be without unit, # which fails. With errors="coerce" this becomes NaT.
res = to_datetime(["1.5"], unit=unit, errors="coerce") - expected = to_datetime([NaT]) + expected = to_datetime([NaT]).as_unit("ns") tm.assert_index_equal(res, expected) # round floats are OK @@ -1980,6 +2013,7 @@ def test_dataframe(self, df, cache): # dict-like result = to_datetime(df[["year", "month", "day"]].to_dict(), cache=cache) + expected.index = Index([0, 1]) tm.assert_series_equal(result, expected) def test_dataframe_dict_with_constructable(self, df, cache): @@ -1988,7 +2022,8 @@ def test_dataframe_dict_with_constructable(self, df, cache): df2["month"] = 2 result = to_datetime(df2, cache=cache) expected2 = Series( - [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")] + [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")], + index=Index([0, 1]), ) tm.assert_series_equal(result, expected2) @@ -2149,7 +2184,7 @@ def test_dataframe_utc_true(self): df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = to_datetime(df, utc=True) expected = Series( - np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[s]") ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -2323,7 +2358,7 @@ def test_to_datetime_iso8601_non_padded(self, input, format): ) def test_to_datetime_iso8601_with_timezone_valid(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 - expected = Timestamp(2020, 1, 1, tzinfo=pytz.UTC) + expected = Timestamp(2020, 1, 1, tzinfo=timezone.utc) result = to_datetime(input, format=format) assert result == expected @@ -2361,7 +2396,9 @@ def test_to_datetime_with_space_in_series(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) - expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) + expected_coerce = Series( + [datetime(2006, 10, 18), datetime(2008, 10, 18), NaT] + ).dt.as_unit("s") tm.assert_series_equal(result_coerce, expected_coerce) @td.skip_if_not_us_locale @@ -2473,7 +2510,7 @@ def test_string_na_nat_conversion(self, cache): strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) - expected = np.empty(4, dtype="M8[ns]") + expected = np.empty(4, dtype="M8[s]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -2518,7 +2555,7 @@ def test_string_na_nat_conversion_with_name(self, cache): result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) + expected = Series(np.empty(5, dtype="M8[s]"), index=idx) for i in range(5): x = series.iloc[i] if isna(x): @@ -2558,7 +2595,7 @@ def test_dayfirst(self, cache): arr = ["10/02/2014", "11/02/2014", "12/02/2014"] expected = DatetimeIndex( [datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)] - ) + ).as_unit("s") idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -2582,7 +2619,7 @@ def test_dayfirst_warnings_valid_input(self): # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None ) # A. 
dayfirst arg correct, no warning @@ -2687,7 +2724,7 @@ def test_to_datetime_consistent_format(self, cache): ser = Series(np.array(data)) result = to_datetime(ser, cache=cache) expected = Series( - ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[ns]" + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2699,9 +2736,7 @@ def test_to_datetime_series_with_nans(self, cache): ) ) result = to_datetime(ser, cache=cache) - expected = Series( - ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[ns]" - ) + expected = Series(["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[s]") tm.assert_series_equal(result, expected) def test_to_datetime_series_start_with_nans(self, cache): @@ -2720,7 +2755,7 @@ def test_to_datetime_series_start_with_nans(self, cache): result = to_datetime(ser, cache=cache) expected = Series( - [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[ns]" + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2734,6 +2769,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) + expected = expected.dt.as_unit("s") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -2749,7 +2785,7 @@ def test_infer_datetime_format_zero_tz(self, ts, zero_tz): # GH 41047 ser = Series([ts + zero_tz]) result = to_datetime(ser) - tz = pytz.utc if zero_tz == "Z" else None + tz = timezone.utc if zero_tz == "Z" else None expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) @@ -2890,9 +2926,16 @@ def test_parsers(self, date_str, expected, cache): # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _ = parsing.parse_datetime_string_with_reso( + result1, reso_attrname = parsing.parse_datetime_string_with_reso( date_str, yearfirst=yearfirst ) + + reso = { + "nanosecond": "ns", + "microsecond": "us", + "millisecond": "ms", + "second": "s", + }.get(reso_attrname, "s") result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below @@ -2907,7 +2950,7 @@ def test_parsers(self, date_str, expected, cache): for res in [result1, result2]: assert res == expected for res in [result3, result4, result6, result8, result9]: - exp = DatetimeIndex([Timestamp(expected)]) + exp = DatetimeIndex([Timestamp(expected)]).as_unit(reso) tm.assert_index_equal(res, exp) # these really need to have yearfirst, but we don't support @@ -2921,7 +2964,7 @@ def test_na_values_with_cache( self, cache, unique_nulls_fixture, unique_nulls_fixture2 ): # GH22305 - expected = Index([NaT, NaT], dtype="datetime64[ns]") + expected = Index([NaT, NaT], dtype="datetime64[s]") result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) tm.assert_index_equal(result, expected) @@ -2947,6 +2990,8 @@ def test_parsers_nat(self): ("20/12/21", True, False, datetime(2021, 12, 20)), ("20/12/21", False, True, datetime(2020, 12, 21)), ("20/12/21", True, True, datetime(2020, 12, 21)), + # GH 58859 + ("20201012", True, False, datetime(2020, 12, 10)), ], ) def test_parsers_dayfirst_yearfirst( @@ -3146,7 +3191,7 @@ def test_invalid_origin(self, unit): ) def test_epoch(self, units, epochs): epoch_1960 = Timestamp(1960, 1, 1) - units_from_epochs = list(range(5)) + units_from_epochs = np.arange(5, 
dtype=np.int64) expected = Series( [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] ) @@ -3177,7 +3222,7 @@ def test_invalid_origins(self, origin, exc, units): def test_invalid_origins_tzinfo(self): # GH16842 with pytest.raises(ValueError, match="must be tz-naive"): - to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=timezone.utc)) def test_incorrect_value_exception(self): # GH47495 @@ -3197,9 +3242,16 @@ def test_incorrect_value_exception(self): ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 - msg = r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-10 00:00:00", format=format) + if format is None: + res = to_datetime("2417-10-10 00:00:00.00", format=format) + assert isinstance(res, Timestamp) + assert res.year == 2417 + assert res.month == 10 + assert res.day == 10 + else: + msg = "unconverted data remains when parsing with format.*, at position 0" + with pytest.raises(ValueError, match=msg): + to_datetime("2417-10-10 00:00:00.00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", @@ -3331,7 +3383,7 @@ def test_empty_string_datetime(errors, args, format): # coerce empty string to pd.NaT result = to_datetime(td, format=format, errors=errors) - expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[ns]") + expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[s]") tm.assert_series_equal(expected, result) @@ -3371,14 +3423,12 @@ def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): ) result1 = to_datetime(ser, errors="coerce", utc=True) - expected1 = Series( - [NaT] + ([Timestamp("1991-10-20 00:00:00+00:00")] * series_length) - ) - + expected1 = Series([Timestamp(x) for x in ser]) + assert expected1.dtype == "M8[us, UTC]" tm.assert_series_equal(result1, expected1) - with pytest.raises(OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp"): - to_datetime(ser, errors="raise", utc=True) + result3 = to_datetime(ser, errors="raise", utc=True) + tm.assert_series_equal(result3, expected1) def test_to_datetime_format_f_parse_nanos(): @@ -3463,7 +3513,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 vals = ["2020-01-01 00:00+00:00", ""] result = to_datetime(vals, format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[ns, UTC]") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[s, UTC]") tm.assert_index_equal(result, expected) # Check that a couple of other similar paths work the same way diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index ba000a0439dd1..9ec2689069da9 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -29,7 +29,7 @@ def test_to_timedelta_dt64_raises(self): # supported GH#29794 msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" - ser = Series([pd.NaT]) + ser = Series([pd.NaT], dtype="M8[ns]") with pytest.raises(TypeError, match=msg): to_timedelta(ser) with pytest.raises(TypeError, match=msg): @@ -56,7 +56,10 @@ def test_to_timedelta_same_np_timedelta64(self): def test_to_timedelta_series(self): # Series expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) - result = to_timedelta(Series(["1d", "1days 00:00:01"])) + + msg = "'d' is 
deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_timedelta(Series(["1d", "1days 00:00:01"])) tm.assert_series_equal(result, expected) def test_to_timedelta_units(self): diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index edfc1973a2bd9..dad5c73b89626 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -23,7 +23,6 @@ date_range, period_range, ) -import pandas._testing as tm from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -202,17 +201,6 @@ def test_infer_freq_custom(base_delta_code_pair, constructor): assert frequencies.infer_freq(index) is None -@pytest.mark.parametrize( - "freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")] -) -def test_infer_freq_index(freq, expected): - rng = period_range("1959Q2", "2009Q3", freq=freq) - with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): - rng = Index(rng.to_timestamp("D", how="e").astype(object)) - - assert rng.inferred_freq == expected - - @pytest.mark.parametrize( "expected,dates", list( diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index 99829857e6836..90e2e117852a2 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -57,10 +57,10 @@ def __init__(self, name=None, rules=None) -> None: jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)]) # Getting holidays for Jan 1 should not alter results for Jan 2. - expected = DatetimeIndex(["01-Jan-2015"]).as_unit("ns") + expected = DatetimeIndex(["01-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan1.holidays(), expected) - expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("ns") + expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan2.holidays(), expected2) diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 08f4a1250392e..ffe6ff0b51bcf 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -1,7 +1,9 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import pytest -from pytz import utc from pandas import ( DatetimeIndex, @@ -128,9 +130,9 @@ def test_holiday_dates(holiday, start_date, end_date, expected): # Verify that timezone info is preserved. assert list( holiday.dates( - utc.localize(Timestamp(start_date)), utc.localize(Timestamp(end_date)) + Timestamp(start_date, tz=timezone.utc), Timestamp(end_date, tz=timezone.utc) ) - ) == [utc.localize(dt) for dt in expected] + ) == [dt.replace(tzinfo=timezone.utc) for dt in expected] @pytest.mark.parametrize( @@ -194,8 +196,10 @@ def test_holidays_within_dates(holiday, start, expected): # Verify that timezone info is preserved. 
assert list( - holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(start))) - ) == [utc.localize(dt) for dt in expected] + holiday.dates( + Timestamp(start, tz=timezone.utc), Timestamp(start, tz=timezone.utc) + ) + ) == [dt.replace(tzinfo=timezone.utc) for dt in expected] @pytest.mark.parametrize( diff --git a/pandas/tests/tseries/offsets/test_dst.py b/pandas/tests/tseries/offsets/test_dst.py index 8ff80536fc69e..dfdc69c0fe18e 100644 --- a/pandas/tests/tseries/offsets/test_dst.py +++ b/pandas/tests/tseries/offsets/test_dst.py @@ -5,7 +5,6 @@ from datetime import timedelta import pytest -import pytz from pandas._libs.tslibs import Timestamp from pandas._libs.tslibs.offsets import ( @@ -33,10 +32,8 @@ from pandas import DatetimeIndex import pandas._testing as tm -from pandas.util.version import Version -# error: Module has no attribute "__version__" -pytz_version = Version(pytz.__version__) # type: ignore[attr-defined] +pytz = pytest.importorskip("pytz") def get_utc_offset_hours(ts): @@ -52,7 +49,10 @@ class TestDST: # test both basic names and dateutil timezones timezone_utc_offsets = { - "US/Eastern": {"utc_offset_daylight": -4, "utc_offset_standard": -5}, + pytz.timezone("US/Eastern"): { + "utc_offset_daylight": -4, + "utc_offset_standard": -5, + }, "dateutil/US/Pacific": {"utc_offset_daylight": -7, "utc_offset_standard": -8}, } valid_date_offsets_singular = [ @@ -96,7 +96,10 @@ def _test_offset( if ( offset_name in ["hour", "minute", "second", "microsecond"] and offset_n == 1 - and tstart == Timestamp("2013-11-03 01:59:59.999999-0500", tz="US/Eastern") + and tstart + == Timestamp( + "2013-11-03 01:59:59.999999-0500", tz=pytz.timezone("US/Eastern") + ) ): # This addition results in an ambiguous wall time err_msg = { @@ -147,7 +150,9 @@ def _test_offset( assert datepart_offset == offset.kwds[offset_name] else: # the offset should be the same as if it was done in UTC - assert t == (tstart.tz_convert("UTC") + offset).tz_convert("US/Pacific") + assert t == (tstart.tz_convert("UTC") + offset).tz_convert( + pytz.timezone("US/Pacific") + ) def _make_timestamp(self, string, hrs_offset, tz): if hrs_offset >= 0: @@ -224,16 +229,6 @@ def test_all_offset_classes(self, tup): @pytest.mark.parametrize( "original_dt, target_dt, offset, tz", [ - pytest.param( - Timestamp("1900-01-01"), - Timestamp("1905-07-01"), - MonthBegin(66), - "Africa/Lagos", - marks=pytest.mark.xfail( - pytz_version < Version("2020.5") or pytz_version == Version("2022.2"), - reason="GH#41906: pytz utc transition dates changed", - ), - ), ( Timestamp("2021-10-01 01:15"), Timestamp("2021-10-31 01:15"), @@ -263,7 +258,7 @@ def test_all_offset_classes(self, tup): def test_nontick_offset_with_ambiguous_time_error(original_dt, target_dt, offset, tz): # .apply for non-Tick offsets throws AmbiguousTimeError when the target dt # is dst-ambiguous - localized_dt = original_dt.tz_localize(tz) + localized_dt = original_dt.tz_localize(pytz.timezone(tz)) msg = f"Cannot infer dst time from {target_dt}, try using the 'ambiguous' argument" with pytest.raises(pytz.AmbiguousTimeError, match=msg): diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 1e5bfa6033216..d19717e87c7d2 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -788,9 +788,7 @@ def test_get_offset(): pairs = [ ("B", BDay()), - ("b", BDay()), - ("bme", BMonthEnd()), - ("Bme", BMonthEnd()), + ("BME", BMonthEnd()), ("W-MON", Week(weekday=0)), ("W-TUE", 
Week(weekday=1)), ("W-WED", Week(weekday=2)), diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 35b72c9bb2887..3c55ae2c6f904 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -15,7 +15,6 @@ tslib, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas import Timestamp import pandas._testing as tm @@ -156,7 +155,7 @@ def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(expected, dtype="M8[ns]") + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -174,6 +173,8 @@ def test_parsing_timezone_offsets(dt_string, expected_tz): # to the same datetime after the timezone offset is added. arr = np.array(["01-01-2013 00:00:00"], dtype=object) expected, _ = tslib.array_to_datetime(arr) + if "000000000" in dt_string: + expected = expected.astype("M8[ns]") arr = np.array([dt_string], dtype=object) result, result_tz = tslib.array_to_datetime(arr) @@ -206,38 +207,35 @@ def test_parsing_different_timezone_offsets(): @pytest.mark.parametrize( - "invalid_date", + "invalid_date,exp_unit", [ - date(1000, 1, 1), - datetime(1000, 1, 1), - "1000-01-01", - "Jan 1, 1000", - np.datetime64("1000-01-01"), + (date(1000, 1, 1), "s"), + (datetime(1000, 1, 1), "us"), + ("1000-01-01", "s"), + ("Jan 1, 1000", "s"), + (np.datetime64("1000-01-01"), "s"), ], ) @pytest.mark.parametrize("errors", ["coerce", "raise"]) -def test_coerce_outside_ns_bounds(invalid_date, errors): +def test_coerce_outside_ns_bounds(invalid_date, exp_unit, errors): arr = np.array([invalid_date], dtype="object") - kwargs = {"values": arr, "errors": errors} - if errors == "raise": - msg = "^Out of bounds nanosecond timestamp: .*, at position 0$" + result, _ = tslib.array_to_datetime(arr, errors=errors) + out_reso = np.datetime_data(result.dtype)[0] + assert out_reso == exp_unit + ts = Timestamp(invalid_date) + assert ts.unit == exp_unit - with pytest.raises(OutOfBoundsDatetime, match=msg): - tslib.array_to_datetime(**kwargs) - else: # coerce. 
- result, _ = tslib.array_to_datetime(**kwargs) - expected = np.array([iNaT], dtype="M8[ns]") - - tm.assert_numpy_array_equal(result, expected) + expected = np.array([ts._value], dtype=f"M8[{exp_unit}]") + tm.assert_numpy_array_equal(result, expected) def test_coerce_outside_ns_bounds_one_valid(): arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = [iNaT, "2000-01-01T00:00:00.000000000"] - expected = np.array(expected, dtype="M8[ns]") + expected = ["1000-01-01T00:00:00.000000000", "2000-01-01T00:00:00.000000000"] + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -247,7 +245,7 @@ def test_coerce_of_invalid_datetimes(): # With coercing, the invalid dates become iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) def test_to_datetime_barely_out_of_bounds(): @@ -292,5 +296,5 @@ def test_datetime_subclass(klass): arr = np.array([klass(2000, 1, 1)], dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(["2000-01-01T00:00:00.000000000"], dtype="M8[ns]") + expected = np.array(["2000-01-01T00:00:00.000000"], dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 6a0b86cbd03ee..f62910b5e1f1c 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -1,8 +1,10 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest -from pytz import UTC from pandas._libs.tslibs import ( OutOfBoundsTimedelta, @@ -55,7 +57,7 @@ def _compare_local_to_utc(tz_didx, naive_didx): def test_tz_localize_to_utc_copies(): # GH#46460 arr = np.arange(5, dtype="i8") - result = tz_convert_from_utc(arr, tz=UTC) + result = tz_convert_from_utc(arr, tz=timezone.utc) tm.assert_numpy_array_equal(result, arr) assert not np.shares_memory(arr, result) @@ -100,7 +102,7 @@ def test_tz_convert_readonly(): # GH#35530 arr = np.array([0], dtype=np.int64) arr.setflags(write=False) - result = tz_convert_from_utc(arr, UTC) + result = tz_convert_from_utc(arr, timezone.utc) tm.assert_numpy_array_equal(result, arr) @@ -141,14 +143,18 @@ class SubDatetime(datetime): "dt, expected", [ pytest.param( - Timestamp("2000-01-01"), Timestamp("2000-01-01", tz=UTC), id="timestamp" + Timestamp("2000-01-01"), + Timestamp("2000-01-01", tz=timezone.utc), + id="timestamp", ), pytest.param( - datetime(2000, 1, 1), datetime(2000, 1, 1, tzinfo=UTC), id="datetime" + datetime(2000, 1, 1), + datetime(2000, 1, 1, tzinfo=timezone.utc), + id="datetime", ), pytest.param( SubDatetime(2000, 1, 1), - SubDatetime(2000, 1, 1, tzinfo=UTC), + SubDatetime(2000, 1, 1, tzinfo=timezone.utc), id="subclassed_datetime", ), ], @@ -157,5 +163,5 @@ def test_localize_pydatetime_dt_types(dt, expected): # GH 25851 # ensure that subclassed datetime works with # localize_pydatetime - result = conversion.localize_pydatetime(dt, UTC) + result = conversion.localize_pydatetime(dt, timezone.utc) assert
result == expected diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py index e9da6b3cf991c..0e7705ad7ed94 100644 --- a/pandas/tests/tslibs/test_resolution.py +++ b/pandas/tests/tslibs/test_resolution.py @@ -1,6 +1,7 @@ +import datetime + import numpy as np import pytest -import pytz from pandas._libs.tslibs import ( Resolution, @@ -8,8 +9,6 @@ ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -import pandas._testing as tm - def test_get_resolution_nano(): # don't return the fallback RESO_DAY @@ -23,7 +22,7 @@ def test_get_resolution_non_nano_data(): res = get_resolution(arr, None, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US - res = get_resolution(arr, pytz.UTC, NpyDatetimeUnit.NPY_FR_us.value) + res = get_resolution(arr, datetime.timezone.utc, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US @@ -49,16 +48,9 @@ def test_get_attrname_from_abbrev(freqstr, expected): @pytest.mark.parametrize("freq", ["H", "S"]) -def test_units_H_S_deprecated_from_attrname_to_abbrevs(freq): - # GH#52536 - msg = f"'{freq}' is deprecated and will be removed in a future version." - - with tm.assert_produces_warning(FutureWarning, match=msg): - Resolution.get_reso_from_freqstr(freq) - +def test_unit_H_S_raises(freq): + # GH#59143 + msg = f"Invalid frequency: {freq}" -@pytest.mark.parametrize("freq", ["T", "t", "L", "U", "N", "n"]) -def test_reso_abbrev_T_L_U_N_raises(freq): - msg = f"Frequency '{freq}' is no longer supported." with pytest.raises(ValueError, match=msg): Resolution.get_reso_from_freqstr(freq) diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 28e4889983fb9..8dd7060f21d59 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -6,7 +6,6 @@ import dateutil.tz import pytest -import pytz from pandas._libs.tslibs import ( conversion, @@ -22,10 +21,11 @@ def test_is_utc(utc_fixture): assert timezones.is_utc(tz) -@pytest.mark.parametrize("tz_name", list(pytz.common_timezones)) -def test_cache_keys_are_distinct_for_pytz_vs_dateutil(tz_name): - tz_p = timezones.maybe_get_tz(tz_name) - tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) +def test_cache_keys_are_distinct_for_pytz_vs_dateutil(): + pytz = pytest.importorskip("pytz") + for tz_name in pytz.common_timezones: + tz_p = timezones.maybe_get_tz(tz_name) + tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) if tz_d is None: pytest.skip(tz_name + ": dateutil does not know about this one") @@ -76,12 +76,15 @@ def test_tz_compare_utc(utc_fixture, utc_fixture2): @pytest.fixture( params=[ - (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), + ("pytz/US/Eastern", lambda tz, x: tz.localize(x)), (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)), ] ) def infer_setup(request): eastern, localize = request.param + if isinstance(eastern, str) and eastern.startswith("pytz/"): + pytz = pytest.importorskip("pytz") + eastern = pytz.timezone(eastern.removeprefix("pytz/")) start_naive = datetime(2001, 1, 1) end_naive = datetime(2009, 1, 1) @@ -111,10 +114,10 @@ def test_infer_tz_compat(infer_setup): def test_infer_tz_utc_localize(infer_setup): _, _, start, end, start_naive, end_naive = infer_setup - utc = pytz.utc + utc = timezone.utc - start = utc.localize(start_naive) - end = utc.localize(end_naive) + start = start_naive.astimezone(utc) + end = end_naive.astimezone(utc) assert timezones.infer_tzinfo(start, end) is utc @@ -124,8 +127,8 @@ def 
test_infer_tz_mismatch(infer_setup, ordered): eastern, _, _, _, start_naive, end_naive = infer_setup msg = "Inputs must both have the same timezone" - utc = pytz.utc - start = utc.localize(start_naive) + utc = timezone.utc + start = start_naive.astimezone(utc) end = conversion.localize_pydatetime(end_naive, eastern) args = (start, end) if ordered else (end, start) @@ -139,7 +142,7 @@ def test_maybe_get_tz_invalid_types(): timezones.maybe_get_tz(44.0) with pytest.raises(TypeError, match=""): - timezones.maybe_get_tz(pytz) + timezones.maybe_get_tz(pytest) msg = "" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index ad4e9e2bcf38a..67521c7e2a3ac 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -31,6 +31,7 @@ ("2SME-16", offsets.SemiMonthEnd(2, day_of_month=16)), ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), ("2SMS-15", offsets.SemiMonthBegin(2)), + ("LWOM-MON", offsets.LastWeekOfMonth()), ], ) def test_to_offset(freq_input, expected): @@ -61,11 +62,11 @@ def test_to_offset_negative(freqstr, expected): "2SMS-15D", "100foo", # Invalid leading +/- signs. - "+-1d", + "+-1D", "-+1h", "+1", "-7", - "+d", + "+D", "-m", # Invalid shortcut anchors. "SME-0", @@ -128,9 +129,14 @@ def test_to_offset_leading_zero(freqstr, expected): assert result.n == expected -@pytest.mark.parametrize("freqstr,expected", [("+1d", 1), ("+2h30min", 150)]) -def test_to_offset_leading_plus(freqstr, expected): - result = to_offset(freqstr) +@pytest.mark.parametrize( + "freqstr,expected,wrn", [("+1d", 1, FutureWarning), ("+2h30min", 150, None)] +) +def test_to_offset_leading_plus(freqstr, expected, wrn): + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(wrn, match=msg): + result = to_offset(freqstr) assert result.n == expected @@ -185,36 +191,49 @@ def test_anchored_shortcuts(shortcut, expected): "2qs-feb", "2bqs", "2sms", + "1sme", "2bms", "2cbme", "2me", - "2w", ], ) -def test_to_offset_lowercase_frequency_deprecated(freq_depr): +def test_to_offset_lowercase_frequency_raises(freq_depr): + msg = f"Invalid frequency: {freq_depr}" + + with pytest.raises(ValueError, match=msg): + to_offset(freq_depr) + + +@pytest.mark.parametrize("freq_depr", ["2MIN", "2Us", "2NS"]) +def test_to_offset_uppercase_frequency_deprecated(freq_depr): # GH#54939 depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version, please use '{freq_depr.upper()[1:]}' instead." + f"future version, please use '{freq_depr.lower()[1:]}' instead." with tm.assert_produces_warning(FutureWarning, match=depr_msg): to_offset(freq_depr) @pytest.mark.parametrize( - "freq_depr", + "freq_depr,expected", [ - "2H", - "2BH", - "2MIN", - "2S", - "2Us", - "2NS", + ("2w", offsets.Week(2, weekday=6)), + ("2b", offsets.BusinessDay(2)), + ("2d", offsets.Day(2)), ], ) -def test_to_offset_uppercase_frequency_deprecated(freq_depr): - # GH#54939 - depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " - f"future version, please use '{freq_depr.lower()[1:]}' instead." +def test_to_offset_lowercase_frequency_deprecated(freq_depr, expected): + # GH#54939, GH#58998 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version." 
- with tm.assert_produces_warning(FutureWarning, match=depr_msg): - to_offset(freq_depr) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_offset(freq_depr) + assert result == expected + + +@pytest.mark.parametrize("freq", ["2H", "2BH", "2S"]) +def test_to_offset_uppercase_frequency_raises(freq): + msg = f"Invalid frequency: {freq}" + + with pytest.raises(ValueError, match=msg): + to_offset(freq) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index a54e0071aa006..e654534ccd453 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -260,14 +260,14 @@ def test_categorical_consistency(s1, categorize): tm.assert_series_equal(h1, h3) -def test_categorical_with_nan_consistency(): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") - ) - expected = hash_array(c, categorize=False) - - c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) - result = hash_array(c, categorize=False) +def test_categorical_with_nan_consistency(unit): + dti = pd.date_range("2012-01-01", periods=5, name="B", unit=unit) + cat = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], categories=dti) + expected = hash_array(cat, categorize=False) + + ts = pd.Timestamp("2012-01-01").as_unit(unit) + cat2 = pd.Categorical.from_codes([-1, 0], categories=[ts]) + result = hash_array(cat2, categorize=False) assert result[0] in expected assert result[1] in expected diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 35c896dc0090b..4ea6c805a2ee4 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -102,7 +102,8 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): tm.assert_frame_equal(result, expected) -def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): +def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit, adjust): + # GH 54328 tz = tz_aware_fixture halflife = "23 days" times = ( @@ -112,8 +113,11 @@ def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): ) data = np.arange(3) df = DataFrame(data) - result = df.ewm(halflife=halflife, times=times).mean() - expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + result = df.ewm(halflife=halflife, times=times, adjust=adjust).mean() + if adjust: + expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + else: + expected = DataFrame([0.0, 0.23762518642226227, 1.534926369128742]) tm.assert_frame_equal(result, expected) @@ -148,13 +152,56 @@ def test_ewm_getitem_attributes_retained(arg, adjust, ignore_na): assert result == expected -def test_ewma_times_adjust_false_raises(): - # GH 40098 +def test_ewma_times_adjust_false_with_disallowed_com(): + # GH 54328 + with pytest.raises( + NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), + ): + Series(range(1)).ewm( + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + halflife="1D", + ) + + +def test_ewma_times_adjust_false_with_disallowed_alpha(): + # GH 54328 with pytest.raises( - NotImplementedError, match="times is not supported with adjust=False." 
+ NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), + ): + Series(range(1)).ewm( + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + alpha=0.5, + halflife="1D", + ) + + +def test_ewma_times_adjust_false_with_disallowed_span(): + # GH 54328 + with pytest.raises( + NotImplementedError, + match=( + "None of com, span, or alpha can be specified " + "if times is provided and adjust=False" + ), ): Series(range(1)).ewm( - 0.1, adjust=False, times=date_range("2000", freq="D", periods=1) + 0.1, + adjust=False, + times=date_range("2000", freq="D", periods=1), + span=10, + halflife="1D", ) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index b4a045cd26fe4..b2f76bdd0e2ad 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -502,8 +502,8 @@ def test_expanding_apply_min_periods_0(engine_and_raw): def test_expanding_cov_diff_index(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.expanding().cov(s2) expected = Series([None, None, 2.0]) tm.assert_series_equal(result, expected) @@ -515,14 +515,14 @@ def test_expanding_cov_diff_index(): s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().cov(s2) - expected = Series([None, None, None, 4.5]) + expected = Series([None, None, None, 4.5], index=list(range(4))) tm.assert_series_equal(result, expected) def test_expanding_corr_diff_index(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.expanding().corr(s2) expected = Series([None, None, 1.0]) tm.assert_series_equal(result, expected) @@ -534,7 +534,7 @@ def test_expanding_corr_diff_index(): s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.0]) + expected = Series([None, None, None, 1.0], index=list(range(4))) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 120470b09a92b..4d37c6d57f788 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -582,7 +582,7 @@ def test_groupby_rolling_string_index(self): groups = df.groupby("group") df["count_to_date"] = groups.cumcount() - rolling_groups = groups.rolling("10d", on="eventTime") + rolling_groups = groups.rolling("10D", on="eventTime") result = rolling_groups.apply(lambda df: df.shape[0]) expected = DataFrame( [ @@ -623,11 +623,14 @@ def test_groupby_rolling_count_closed_on(self, unit): "date": date_range(end="20190101", periods=6, unit=unit), } ) - result = ( - df.groupby("group") - .rolling("3d", on="date", closed="left")["column1"] - .count() - ) + msg = "'d' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + df.groupby("group") + .rolling("3d", on="date", closed="left")["column1"] + .count() + ) dti = DatetimeIndex( [ "2018-12-27", diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 650eb911e410b..23b17c651f08d 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -67,6 +67,21 @@ def f(x, *args): ) tm.assert_series_equal(result, expected) + def test_numba_min_periods(self): + # GH 58868 + def last_row(x): + assert len(x) == 3 + return x[-1] + + df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]]) + + result = df.rolling(3, method="table", min_periods=3).apply( + last_row, raw=True, engine="numba" + ) + + expected = DataFrame([[np.nan, np.nan], [np.nan, np.nan], [5, 6], [7, 8]]) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "data", [ @@ -304,7 +319,9 @@ def f(x): @td.skip_if_no("numba") def test_invalid_kwargs_nopython(): - with pytest.raises(NumbaUtilError, match="numba does not support kwargs with"): + with pytest.raises( + NumbaUtilError, match="numba does not support keyword-only arguments" + ): Series(range(1)).rolling(1).apply( lambda x: x, kwargs={"a": 1}, engine="numba", raw=True ) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 6fae79ee70702..d23c6501ed1d1 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -103,6 +103,7 @@ def test_flex_binary_frame(method, frame): ) res3 = getattr(frame.rolling(window=10), method)(frame2) + res3.columns = Index(list(res3.columns)) exp = DataFrame( {k: getattr(frame[k].rolling(window=10), method)(frame2[k]) for k in frame} ) @@ -143,26 +144,26 @@ def test_corr_sanity(): def test_rolling_cov_diff_length(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.rolling(window=3, min_periods=2).cov(s2) expected = Series([None, None, 2.0]) tm.assert_series_equal(result, expected) - s2a = Series([1, None, 3], index=[0, 1, 2]) + s2a = Series([1, None, 3], index=range(3)) result = s1.rolling(window=3, min_periods=2).cov(s2a) tm.assert_series_equal(result, expected) def test_rolling_corr_diff_length(): # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) + s1 = Series([1, 2, 3], index=range(3)) + s2 = Series([1, 3], index=range(0, 4, 2)) result = s1.rolling(window=3, min_periods=2).corr(s2) expected = Series([None, None, 1.0]) tm.assert_series_equal(result, expected) - s2a = Series([1, None, 3], index=[0, 1, 2]) + s2a = Series([1, None, 3], index=range(3)) result = s1.rolling(window=3, min_periods=2).corr(s2a) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index fc8d7f69b8180..af3194b5085c4 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -578,7 +578,7 @@ def test_missing_minp_zero_variable(): [np.nan] * 4, index=DatetimeIndex(["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"]), ) - result = x.rolling(Timedelta("2d"), min_periods=0).sum() + result = x.rolling(Timedelta("2D"), min_periods=0).sum() expected = Series(0.0, index=x.index) tm.assert_series_equal(result, expected) @@ -1153,7 +1153,7 @@ def test_timeoffset_as_window_parameter_for_corr(unit): index=dti, ) - res = df.rolling(window="3d").corr() + res = 
df.rolling(window="3D").corr() tm.assert_frame_equal(exp, res) @@ -1380,17 +1380,20 @@ def test_invalid_method(): Series(range(1)).rolling(1, method="foo") -@pytest.mark.parametrize("window", [1, "1d"]) -def test_rolling_descending_date_order_with_offset(window, frame_or_series): +def test_rolling_descending_date_order_with_offset(frame_or_series): # GH#40002 - idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") - obj = frame_or_series(range(1, 4), index=idx) - result = obj.rolling("1d", closed="left").sum() + msg = "'d' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d") + obj = frame_or_series(range(1, 4), index=idx) + result = obj.rolling("1d", closed="left").sum() + expected = frame_or_series([np.nan, 1, 2], index=idx) tm.assert_equal(result, expected) - result = obj.iloc[::-1].rolling("1d", closed="left").sum() - idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1d") + result = obj.iloc[::-1].rolling("1D", closed="left").sum() + idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1D") expected = frame_or_series([np.nan, 3, 2], index=idx) tm.assert_equal(result, expected) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 820b0134cc577..eacdaddfa28b0 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -101,7 +101,7 @@ def test_on(self, regular): # column is valid df = df.copy() df["C"] = date_range("20130101", periods=len(df)) - df.rolling(window="2d", on="C").sum() + df.rolling(window="2D", on="C").sum() # invalid columns msg = "window must be an integer" @@ -109,7 +109,7 @@ def test_on(self, regular): df.rolling(window="2d", on="B") # ok even though on non-selected - df.rolling(window="2d", on="C").B.sum() + df.rolling(window="2D", on="C").B.sum() def test_monotonic_on(self): # on/index must be monotonic @@ -682,7 +682,7 @@ def test_rolling_on_multi_index_level(self): [date_range("20190101", periods=3), range(2)], names=["date", "seq"] ), ) - result = df.rolling("10d", on=df.index.get_level_values("date")).sum() + result = df.rolling("10D", on=df.index.get_level_values("date")).sum() expected = DataFrame( {"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index ) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 8e51183138b5c..bf4ec2e551f01 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -4,7 +4,7 @@ datetime, timedelta, ) -from typing import Callable +from typing import TYPE_CHECKING import warnings from dateutil.relativedelta import ( @@ -35,6 +35,9 @@ Easter, ) +if TYPE_CHECKING: + from collections.abc import Callable + def next_monday(dt: datetime) -> datetime: """ diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index bdfb0b1cad8ae..165824bec131f 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -6,7 +6,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, cast, ) import warnings @@ -19,7 +18,10 @@ from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import ( + Callable, + Mapping, + ) def deprecate( diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 6cdd96996cea6..7e18ebe40cfa8 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -45,7 +45,7 @@ def 
_get_sys_info() -> dict[str, JSONSerializable]: language_code, encoding = locale.getlocale() return { "commit": _get_commit_hash(), - "python": ".".join([str(i) for i in sys.version_info]), + "python": platform.python_version(), "python-bits": struct.calcsize("P") * 8, "OS": uname_result.system, "OS-release": uname_result.release, @@ -70,33 +70,25 @@ def _get_dependency_info() -> dict[str, JSONSerializable]: "pytz", "dateutil", # install / build, - "setuptools", "pip", "Cython", - # test - "pytest", - "hypothesis", # docs "sphinx", - # Other, need a min version - "blosc", - "feather", - "xlsxwriter", - "lxml.etree", - "html5lib", - "pymysql", - "psycopg2", - "jinja2", # Other, not imported. "IPython", - "pandas_datareader", ] + # Optional dependencies deps.extend(list(VERSIONS)) result: dict[str, JSONSerializable] = {} for modname in deps: - mod = import_optional_dependency(modname, errors="ignore") - result[modname] = get_version(mod) if mod else None + try: + mod = import_optional_dependency(modname, errors="ignore") + except Exception: + # Dependency conflicts may raise errors other than ImportError + result[modname] = "N/A" + else: + result[modname] = get_version(mod) if mod else None return result @@ -115,6 +107,11 @@ def show_versions(as_json: str | bool = False) -> None: Info will be written to that file in JSON format. * If True, outputs info in JSON format to the console. + See Also + -------- + get_option : Retrieve the value of the specified option. + set_option : Set the value of the specified option or options. + Examples -------- >>> pd.show_versions() # doctest: +SKIP diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 48684c4810d2a..1c17587db72d4 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -27,14 +27,12 @@ def test_foo(): from __future__ import annotations import locale -from typing import ( - TYPE_CHECKING, - Callable, -) +from typing import TYPE_CHECKING import pytest if TYPE_CHECKING: + from collections.abc import Callable from pandas._typing import F from pandas.compat import ( diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 494f306ec807d..c0e9756372f47 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -27,6 +27,10 @@ def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> Non both doctests/regular tests, just append "--doctest-modules"/"--doctest-cython" to extra_args. + See Also + -------- + pytest.main : The main entry point for the pytest testing framework. 
+ Examples -------- >>> pd.test() # doctest: +SKIP diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index 153424e339c45..9838e371f0d00 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -8,11 +8,13 @@ from __future__ import annotations import collections -from collections.abc import Iterator +from collections.abc import ( + Callable, + Iterator, +) import itertools import re from typing import ( - Callable, SupportsInt, Tuple, Union, diff --git a/pyproject.toml b/pyproject.toml index 085c054f8241a..cc5cc1cf84d0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ authors = [ { name = 'The Pandas Development Team', email='pandas-dev@python.org' }, ] license = {file = 'LICENSE'} -requires-python = '>=3.9' +requires-python = '>=3.10' dependencies = [ "numpy>=1.23.5; python_version<'3.12'", "numpy>=1.26.0; python_version>='3.12'", @@ -43,7 +43,6 @@ classifiers = [ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', @@ -59,7 +58,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] +test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] pyarrow = ['pyarrow>=10.0.1'] performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] @@ -92,7 +91,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'fsspec>=2022.11.0', 'gcsfs>=2022.11.0', 'html5lib>=1.1', - 'hypothesis>=6.46.1', + 'hypothesis>=6.84.0', 'jinja2>=3.1.2', 'lxml>=4.9.2', 'matplotlib>=3.6.3', @@ -106,7 +105,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'PyQt5>=5.15.9', 'pyreadstat>=1.2.0', 'pytest>=7.3.2', - 'pytest-xdist>=2.2.0', + 'pytest-xdist>=3.4.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'qtpy>=2.3.0', @@ -146,23 +145,26 @@ parentdir_prefix = "pandas-" setup = ['--vsenv'] # For Windows [tool.cibuildwheel] -skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x" +skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} -test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" +test-requires = "hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ +free-threaded-support = true +before-build = "bash {package}/scripts/cibw_before_build.sh" +before-test = "bash {package}/scripts/cibw_before_test.sh" [tool.cibuildwheel.windows] -before-build = "pip install delvewheel" +before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" [[tool.cibuildwheel.overrides]] select = "*-musllinux*" -before-test = "apk update && apk add musl-locales" +before-test = "apk update && apk add musl-locales && bash {package}/scripts/cibw_before_test.sh" [[tool.cibuildwheel.overrides]] select = "*-win*" @@ -178,6 +180,16 @@ 
test-command = "" select = "*-macosx*" environment = {CFLAGS="-g0"} +[[tool.cibuildwheel.overrides]] +select = "*pyodide*" +test-requires = "pytest>=7.3.2 hypothesis>=6.84.0" +# Pyodide repairs wheels on its own, using auditwheel-emscripten +repair-wheel-command = "" +test-command = """ + PANDAS_CI='1' python -c 'import pandas as pd; \ + pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ + """ + [tool.ruff] line-length = 88 target-version = "py310" @@ -319,6 +331,8 @@ ignore = [ "RUF007", # mutable-class-default "RUF012", + # type-comparison + "E721", # Additional pylint rules # literal-membership @@ -521,7 +535,6 @@ module = [ "pandas._libs.*", "pandas._testing.*", # TODO "pandas.compat.numpy.function", # TODO - "pandas.compat.compressors", # TODO "pandas.core._numba.executor", # TODO "pandas.core.array_algos.masked_reductions", # TODO "pandas.core.array_algos.putmask", # TODO @@ -722,5 +735,5 @@ exclude_lines = [ directory = "coverage_html_report" [tool.codespell] -ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext" +ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru" ignore-regex = 'https://([\w/\.])+' diff --git a/requirements-dev.txt b/requirements-dev.txt index f5da7f70ccdba..dbfd7c6bf7bf5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,8 +8,8 @@ meson[ninja]==1.2.1 meson-python==0.13.1 pytest>=7.3.2 pytest-cov -pytest-xdist>=2.2.0 -pytest-qt>=4.2.0 +pytest-xdist>=3.4.0 +pytest-qt>=4.4.0 pytest-localserver PyQt5>=5.15.9 coverage @@ -22,7 +22,7 @@ bottleneck>=1.3.6 fastparquet>=2023.10.0 fsspec>=2022.11.0 html5lib>=1.1 -hypothesis>=6.46.1 +hypothesis>=6.84.0 gcsfs>=2022.11.0 ipython jinja2>=3.1.2 diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py index 52eca6f6d93ac..ec0a4a408c800 100644 --- a/scripts/check_for_inconsistent_pandas_namespace.py +++ b/scripts/check_for_inconsistent_pandas_namespace.py @@ -27,10 +27,7 @@ Sequence, ) import sys -from typing import ( - NamedTuple, - Optional, -) +from typing import NamedTuple ERROR_MESSAGE = ( "{path}:{lineno}:{col_offset}: " @@ -89,7 +86,7 @@ def replace_inconsistent_pandas_namespace(visitor: Visitor, content: str) -> str def check_for_inconsistent_pandas_namespace( content: str, path: str, *, replace: bool -) -> Optional[str]: +) -> str | None: tree = ast.parse(content) visitor = Visitor() @@ -121,7 +118,7 @@ def check_for_inconsistent_pandas_namespace( return replace_inconsistent_pandas_namespace(visitor, content) -def main(argv: Optional[Sequence[str]] = None) -> None: +def main(argv: Sequence[str] | None = None) -> None: parser = argparse.ArgumentParser() parser.add_argument("paths", nargs="*") parser.add_argument("--replace", action="store_true") diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh new file mode 100644 index 0000000000000..f3049b27ed5d1 --- /dev/null +++ b/scripts/cibw_before_build.sh @@ -0,0 +1,9 @@ +# TODO: Delete when there's PyPI NumPy/Cython releases the support Python 3.13. +# If free-threading support is not included in those releases, this script will have +# to whether this runs for a free-threaded build instead. 
+PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" +if [[ $PYTHON_VERSION == "313" ]]; then + python -m pip install -U pip + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython + python -m pip install ninja meson-python versioneer[toml] +fi diff --git a/scripts/cibw_before_test.sh b/scripts/cibw_before_test.sh new file mode 100644 index 0000000000000..7d1b143881ced --- /dev/null +++ b/scripts/cibw_before_test.sh @@ -0,0 +1,8 @@ +# TODO: Delete when there's PyPI NumPy/Cython releases the support Python 3.13. +# If free-threading support is not included in those releases, this script will have +# to whether this runs for a free-threaded build instead. +PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" +if [[ $PYTHON_VERSION == "313" ]]; then + python -m pip install -U pip + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy +fi diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index 7bb95d05afb45..d1db7989a95a4 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -12,7 +12,7 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-xdist>=2.2.0 + - pytest-xdist>=3.4.0 - psutil - boto3 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index ed7b9affe9a50..0a53225a5d995 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -39,8 +39,6 @@ classifiers = [ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering' @@ -55,7 +53,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] +test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0'] performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1'] timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] @@ -76,7 +74,7 @@ html = ['beautifulsoup4>=4.9.3', 'html5lib>=1.1', 'lxml>=4.6.3'] xml = ['lxml>=4.6.3'] plot = ['matplotlib>=3.6.1'] output_formatting = ['jinja2>=3.0.0', 'tabulate>=0.8.9'] -clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.2.0'] +clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.3.0'] compression = ['zstandard>=0.15.2'] all = ['beautifulsoup4>=5.9.3', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -100,10 +98,10 @@ all = ['beautifulsoup4>=5.9.3', 'PyQt5>=5.15.1', 'pyreadstat>=1.1.2', 'pytest>=7.3.2', - 'pytest-xdist>=2.2.0', + 'pytest-xdist>=3.4.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.8', - 'qtpy>=2.2.0', + 'qtpy>=2.3.0', 'scipy>=1.7.1', 's3fs>=2021.08.0', 'SQLAlchemy>=1.4.16', @@ -140,7 +138,7 @@ parentdir_prefix = "pandas-" [tool.cibuildwheel] skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*" build-verbosity = "3" -test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0" +test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=3.4.0" test-command = "python {project}/ci/test_wheels.py" 
 [tool.cibuildwheel.macos]
diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml
index 49299aa078ce4..afb28dd2c08bb 100644
--- a/scripts/tests/data/deps_unmodified_random.yaml
+++ b/scripts/tests/data/deps_unmodified_random.yaml
@@ -12,7 +12,7 @@ dependencies:
   # test dependencies
   - pytest>=7.3.2
   - pytest-cov
-  - pytest-xdist>=2.2.0
+  - pytest-xdist>=3.4.0
   - psutil
   - boto3
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index ba3123a07df4b..35f6ffb4980df 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -12,14 +12,14 @@
 import argparse
 import ast
-from collections.abc import Iterable
+from collections.abc import (
+    Callable,
+    Iterable,
+)
 import sys
 import token
 import tokenize
-from typing import (
-    IO,
-    Callable,
-)
+from typing import IO

 PRIVATE_IMPORTS_TO_IGNORE: set[str] = {
     "_extension_array_shared_docs",
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 6cd67302b2a0e..49ece5564c300 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -362,6 +362,18 @@ any Delta table into Pandas dataframe.

 ## Out-of-core

+### [Bodo](https://bodo.ai/)
+
+Bodo is a high-performance Python computing engine that automatically parallelizes and
+optimizes your code through compilation, using HPC (high-performance computing) techniques.
+Designed to operate with native pandas dataframes, Bodo compiles your pandas code to execute
+efficiently across multiple cores on a single machine or across distributed clusters of
+compute nodes. Bodo also makes distributed pandas dataframes queryable with SQL.
+
+The community edition of Bodo is free to use on up to 8 cores. Beyond that, Bodo offers a paid
+enterprise edition. Free licenses of Bodo (for more than 8 cores) are available
+[upon request](https://www.bodo.ai/contact) for academic and non-profit use.
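+
+As a minimal sketch of this usage (assuming the `bodo` package is installed; the
+file path and column names below are illustrative, not part of pandas):
+
+```python
+import bodo
+import pandas as pd
+
+
+@bodo.jit  # Bodo compiles and parallelizes the decorated function
+def daily_totals(path):
+    df = pd.read_parquet(path)  # I/O is parallelized across cores
+    return df.groupby("day", as_index=False)["sales"].sum()
+
+
+print(daily_totals("sales.pq"))
+```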
+
 ### [Cylon](https://cylondata.org/)

 Cylon is a fast, scalable, distributed memory parallel runtime with a pandas
diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md
index bb15b8f997b11..7f5f0326eba6c 100644
--- a/web/pandas/pdeps/0001-purpose-and-guidelines.md
+++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md
@@ -285,3 +285,4 @@ hope can help clarify our meaning here:
 [51417]: https://github.com/pandas-dev/pandas/pull/51417
 [28900]: https://github.com/pandas-dev/pandas/issues/28900
 [35407]: https://github.com/pandas-dev/pandas/issues/35407
+[53576]: https://github.com/pandas-dev/pandas/pull/53576
diff --git a/web/pandas/pdeps/0006-ban-upcasting.md b/web/pandas/pdeps/0006-ban-upcasting.md
index a86455b70c71a..ae5872186bf23 100644
--- a/web/pandas/pdeps/0006-ban-upcasting.md
+++ b/web/pandas/pdeps/0006-ban-upcasting.md
@@ -1,7 +1,7 @@
 # PDEP-6: Ban upcasting in setitem-like operations

 - Created: 23 December 2022
-- Status: Accepted
+- Status: Implemented
 - Discussion: [#39584](https://github.com/pandas-dev/pandas/pull/50402)
 - Author: [Marco Gorelli](https://github.com/MarcoGorelli) ([original issue](https://github.com/pandas-dev/pandas/issues/39584) by [Joris Van den Bossche](https://github.com/jorisvandenbossche))
 - Revision: 1
@@ -244,3 +244,4 @@ Deprecate sometime in the 2.x releases (after 2.0.0 has already been released),
 ### PDEP History

 - 23 December 2022: Initial draft
+- 4 July 2024: Change status to "implemented"
diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/pdeps/0014-string-dtype.md
new file mode 100644
index 0000000000000..5b74f71216454
--- /dev/null
+++ b/web/pandas/pdeps/0014-string-dtype.md
@@ -0,0 +1,375 @@
+# PDEP-14: Dedicated string data type for pandas 3.0
+
+- Created: May 3, 2024
+- Status: Accepted
+- Discussion: https://github.com/pandas-dev/pandas/pull/58551
+- Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche)
+- Revision: 1
+
+## Abstract
+
+This PDEP proposes to introduce a dedicated string dtype that will be used by
+default in pandas 3.0:
+
+* In pandas 3.0, enable a string dtype (`"str"`) by default, using PyArrow if
+  available, or otherwise falling back to a string dtype that uses numpy
+  object-dtype under the hood.
+* The default string dtype will use missing value semantics (using NaN) consistent
+  with the other default data types.
+
+This will give users a long-awaited proper string dtype for 3.0, while 1) not
+(yet) making PyArrow a _hard_ dependency, but only a dependency used by default,
+and 2) leaving room for future improvements (different missing value semantics,
+using NumPy 2.0 strings, etc.).
+
+## Background
+
+Currently, pandas by default stores text data in an `object`-dtype NumPy array.
+The current implementation has two primary drawbacks. First, `object` dtype is
+not specific to strings: any Python object can be stored in an `object`-dtype
+array, not just strings, and seeing `object` as the dtype for a column with
+strings is confusing for users. Second, it is not efficient: all string methods
+on a Series eventually call Python methods on the individual string objects.
+
+To solve the first issue, a dedicated extension dtype for string data has
+already been
+[added in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#dedicated-string-data-type).
+To date, this dtype has been opt-in, requiring users to explicitly request it
+(with `dtype="string"` or `dtype=pd.StringDtype()`). The array backing
+this string dtype was initially almost the same as the default implementation,
+i.e. an `object`-dtype NumPy array of Python strings.
+
+To solve the second issue (performance), pandas contributed to the development
+of string kernels in the PyArrow package, and a variant of the string dtype
+backed by PyArrow was
+[added in pandas 1.3](https://pandas.pydata.org/docs/whatsnew/v1.3.0.html#pyarrow-backed-string-data-type).
+This could be specified with the `storage` keyword in the opt-in string dtype
+(`pd.StringDtype(storage="pyarrow")`).
+
+Since its introduction, the `StringDtype` has always been opt-in, and has used
+the experimental `pd.NA` sentinel for missing values (which was also [introduced
+in pandas 1.0](https://pandas.pydata.org/docs/whatsnew/v1.0.0.html#experimental-na-scalar-to-denote-missing-values)).
+However, up to this date, pandas has not yet taken the step to use `pd.NA`
+for any default dtype, and thus the `StringDtype` deviates in missing value
+behaviour from the default data types.
+
+In 2023, [PDEP-10](https://pandas.pydata.org/pdeps/0010-required-pyarrow-dependency.html)
+proposed to start using a PyArrow-backed string dtype by default in pandas 3.0
+(i.e. infer this type for string data instead of object dtype). To ensure we
+could use the variant of `StringDtype` backed by PyArrow instead of Python
+objects (for better performance), it proposed to make `pyarrow` a new required
+runtime dependency of pandas.
+
+In the meantime, NumPy has also been working on a native variable-width string
+data type, which was made available [starting with NumPy
+2.0](https://numpy.org/devdocs/release/2.0.0-notes.html#stringdtype-has-been-added-to-numpy).
+This can provide a potential alternative to PyArrow for implementing a string
+data type in pandas that is not backed by Python objects.
+
+After acceptance of PDEP-10, two aspects of the proposal have been under
+reconsideration:
+
+- Based on feedback from users and maintainers from other packages (mostly
+  around installation complexity and size), it has been considered to relax the
+  new `pyarrow` requirement to not be a _hard_ runtime dependency. In addition,
+  NumPy 2.0 could in the future potentially reduce the need to make PyArrow a
+  required dependency specifically for a dedicated pandas string dtype.
+- PDEP-10 did not consider the usage of the experimental `pd.NA` as a
+  consequence of adopting one of the existing implementations of the
+  `StringDtype`.
+
+For the second aspect, another variant of the `StringDtype` was
+[introduced in pandas 2.1](https://pandas.pydata.org/docs/whatsnew/v2.1.0.html#whatsnew-210-enhancements-infer-strings)
+that is still backed by PyArrow but follows the default missing value semantics
+pandas uses for all other default data types (using `NaN` as the missing
+value sentinel) ([GH-54792](https://github.com/pandas-dev/pandas/issues/54792)).
+At the time, the `storage` option for this new variant was called
+`"pyarrow_numpy"` to disambiguate it from the existing `"pyarrow"` option using
+`pd.NA` (but this PDEP proposes a better naming scheme, see the "Naming"
+subsection below).
+
+This last dtype variant is what users currently (pandas 2.2) get for string data
+when enabling the ``future.infer_string`` option (to enable the behaviour which
+is intended to become the default in pandas 3.0).
+
+## Proposal
+
+To be able to move forward with a string data type in pandas 3.0, this PDEP proposes:
+
+1. For pandas 3.0, a `"str"` string dtype is enabled by default, i.e. this
+   string dtype will be used as the default dtype for text data when creating
+   pandas objects (e.g. inference in constructors, I/O functions).
+2. This default string dtype will follow the same behaviour for missing values
+   as other default data types, and use `NaN` as the missing value sentinel.
+3. The string dtype will use PyArrow if installed, and otherwise fall back to
+   an in-house, functionally-equivalent (but slower) version. This fallback can
+   reuse (with minor code additions) the existing numpy object-dtype backed
+   StringArray for its implementation.
+4. Installation guidelines are updated to clearly encourage users to install
+   pyarrow for the default user experience.
+
+Those string dtypes enabled by default will then no longer be considered
+experimental.
+
+### Default inference of a string dtype
+
+By default, pandas will infer this new string dtype instead of object dtype for
+string data (when creating pandas objects, such as in constructors or IO
+functions).
+
+In pandas 2.2, the existing `future.infer_string` option can be used to opt in
+to the future default behaviour:
+
+```python
+>>> pd.options.future.infer_string = True
+>>> pd.Series(["a", "b", None])
+0      a
+1      b
+2    NaN
+dtype: string
+```
+
+Right now (pandas 2.2), the existing option only enables the PyArrow-based
+future dtype. For the remaining 2.x releases, this option will be expanded to
+also work when PyArrow is not installed, to enable the object-dtype fallback in
+that case.
+
+### Missing value semantics
+
+As mentioned in the background section, the original `StringDtype` has always
+used the experimental `pd.NA` sentinel for missing values. In addition to using
+`pd.NA` as the scalar for a missing value, this essentially means that:
+
+- String columns follow ["NA-semantics"](https://pandas.pydata.org/docs/user_guide/missing_data.html#na-semantics)
+  for missing values, where `NA` propagates in boolean operations such as
+  comparisons or predicates.
+- Operations on the string column that give a numeric or boolean result use the
+  nullable Integer/Float/Boolean data types (e.g. `ser.str.len()` returns the
+  nullable `"Int64"` / `pd.Int64Dtype()` dtype instead of the numpy `int64`
+  dtype (or `float64` in case of missing values)).
+
+However, up to this date, all other default data types still use `NaN` semantics
+for missing values. Therefore, this proposal says that a new default string
+dtype should also still use the same default missing value semantics and return
+default data types when doing operations on the string column, to be consistent
+with the other default dtypes at this point.
+
+In practice, this means that the default string dtype will use `NaN` as
+the missing value sentinel, and:
+
+- String columns will follow NaN-semantics for missing values, where `NaN` gives
+  False in boolean operations such as comparisons or predicates.
+- Operations on the string column that give a numeric or boolean result will use
+  the default data types (i.e. numpy `int64`/`float64`/`bool`).
+
+Because the original `StringDtype` implementations already use `pd.NA` and
+return masked integer and boolean arrays in operations, a new variant of the
+existing dtypes that uses `NaN` and default data types was needed. The original
+variant of `StringDtype` using `pd.NA` will continue to be available for those
+who were already using it.
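+
+As an illustrative sketch of the difference (the `"str"` spelling and the exact
+resulting dtypes follow this proposal, so they are an assumption about pandas
+3.0 behaviour rather than a released API):
+
+```python
+>>> ser = pd.Series(["a", None], dtype="str")  # proposed default, NaN semantics
+>>> ser.str.len().dtype  # default numpy dtype; float64 because of the missing value
+dtype('float64')
+>>> ser2 = pd.Series(["a", None], dtype="string")  # existing opt-in, NA semantics
+>>> ser2.str.len().dtype  # nullable extension dtype
+Int64Dtype()
+```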
+
+### Object-dtype "fallback" implementation
+
+To avoid a hard dependency on PyArrow for pandas 3.0, this PDEP proposes to keep
+a "fallback" option in case PyArrow is not installed. The original `StringDtype`
+backed by a numpy object-dtype array of Python strings can be mostly reused for
+this (adding a new variant of the dtype), and a new `StringArray` subclass only
+needs minor changes to follow the above-mentioned missing value semantics
+([GH-58451](https://github.com/pandas-dev/pandas/pull/58451)).
+
+For pandas 3.0, this is the most realistic option given this implementation has
+already been available for a long time. Beyond 3.0, further improvements such as
+using NumPy 2.0 ([GH-58503](https://github.com/pandas-dev/pandas/issues/58503))
+or nanoarrow ([GH-58552](https://github.com/pandas-dev/pandas/issues/58552)) can
+still be explored, but at that point that is an implementation detail that
+should not have a direct impact on users (except for performance).
+
+For the original variant of `StringDtype` using `pd.NA`, the default storage is
+currently `"python"` (the object-dtype based implementation). Also for this
+variant, it is proposed to follow the same logic for determining the default
+storage, i.e. default to `"pyarrow"` if available, and otherwise
+fall back to `"python"`.
+
+### Naming
+
+Given the long history of this topic, the naming of the dtypes is a difficult
+topic.
+
+In the first place, it should be acknowledged that most users should not need to
+use storage-specific options. Users are expected to specify a generic name (such
+as `"str"` or `"string"`), and that will give them their default string dtype
+(which depends on whether PyArrow is installed or not).
+
+For the generic string alias to specify the dtype, `"string"` is already used
+for the `StringDtype` using `pd.NA`. This PDEP proposes to use `"str"` for the
+new default `StringDtype` using `NaN`. This ensures backwards compatibility for
+code using `dtype="string"`, and was also chosen because `dtype="str"` or
+`dtype=str` currently already works to ensure your data is converted to
+strings (only using object dtype for the result).
+
+But for testing purposes and advanced use cases that want control over the exact
+variant of the `StringDtype`, we need some way to specify this and distinguish
+them from the other string dtypes.
+
+Currently (pandas 2.2), `StringDtype(storage="pyarrow_numpy")` is used for the
+new variant using `NaN`, where the `"pyarrow_numpy"` storage was used to
+disambiguate it from the existing `"pyarrow"` option using `pd.NA`. However,
+`"pyarrow_numpy"` is a rather confusing option that doesn't generalize well.
+Therefore, this PDEP proposes a new naming scheme as outlined below, and
+`"pyarrow_numpy"` will be deprecated as an alias in pandas 2.3 and removed in
+pandas 3.0.
+
+The `storage` keyword of `StringDtype` is kept to disambiguate the underlying
+storage of the string data (using pyarrow or python objects), but an additional
+`na_value` keyword is introduced to disambiguate the variants using NA semantics
+and NaN semantics, as the sketch below illustrates.
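+
+A short sketch of the proposed spellings (illustrative only; the full mapping is
+given in the overview table below):
+
+```python
+>>> import numpy as np
+>>> pd.StringDtype(storage="pyarrow", na_value=np.nan)  # new default variant
+>>> pd.StringDtype(storage="pyarrow")  # existing variant using pd.NA
+>>> pd.Series(["a", "b"], dtype="str")  # generic alias for the NaN variant
+>>> pd.Series(["a", "b"], dtype="string")  # generic alias for the pd.NA variant
+```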
+
+Overview of the different ways to specify a dtype and the resulting concrete
+dtype of the data:
+
+| User specification                         | Concrete dtype                                              | String alias                          | Note |
+|--------------------------------------------|-------------------------------------------------------------|---------------------------------------|------|
+| Unspecified (inference)                    | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)` | "str"                                 | (1)  |
+| `"str"` or `StringDtype(na_value=np.nan)`  | `StringDtype(storage="pyarrow"\|"python", na_value=np.nan)` | "str"                                 | (1)  |
+| `StringDtype("pyarrow", na_value=np.nan)`  | `StringDtype(storage="pyarrow", na_value=np.nan)`           | "str"                                 |      |
+| `StringDtype("python", na_value=np.nan)`   | `StringDtype(storage="python", na_value=np.nan)`            | "str"                                 |      |
+| `StringDtype("pyarrow")`                   | `StringDtype(storage="pyarrow", na_value=pd.NA)`            | "string[pyarrow]"                     |      |
+| `StringDtype("python")`                    | `StringDtype(storage="python", na_value=pd.NA)`             | "string[python]"                      |      |
+| `"string"` or `StringDtype()`              | `StringDtype(storage="pyarrow"\|"python", na_value=pd.NA)`  | "string[pyarrow]" or "string[python]" | (1)  |
+| `StringDtype("pyarrow_numpy")`             | `StringDtype(storage="pyarrow", na_value=np.nan)`           | "string[pyarrow_numpy]"               | (2)  |
+
+Notes:
+
+- (1) You get "pyarrow" or "python" depending on whether pyarrow is installed.
+- (2) "pyarrow_numpy" is kept temporarily because it is already in a released
+  version, but it will be deprecated in 2.x and removed for 3.0.
+
+For the new default string dtype, only the `"str"` alias can be used to
+specify the dtype as a string, i.e. pandas would not provide a way to make the
+underlying storage (pyarrow or python) explicit through the string alias. This
+string alias is only a convenience shortcut, and for most users `"str"` is
+sufficient (they don't need to specify the storage); the explicit
+`pd.StringDtype(storage=..., na_value=np.nan)` is still available for more
+fine-grained control.
+
+Also for the existing variant using `pd.NA`, specifying the storage through the
+string alias could be deprecated, but that is left for a separate decision.
+
+## Alternatives
+
+### Why not delay introducing a default string dtype?
+
+To avoid introducing a new string dtype while other discussions and changes are
+in flux (eventually making pyarrow a required dependency? adopting `pd.NA` as
+the default missing value sentinel? using the new NumPy 2.0 capabilities?
+overhauling all our dtypes to use a logical data type system?), introducing a
+default string dtype could also be delayed until there is more clarity in those
+other discussions. Specifically, it would avoid temporarily switching to use
+`NaN` for the string dtype, while in a future version we might switch back
+to `pd.NA` by default.
+
+However:
+
+1. Delaying has a cost: it further postpones introducing a dedicated string
+   dtype that has significant benefits for users, both in usability and (for the
+   part of the user base that has PyArrow installed) in performance.
+2. In case pandas eventually transitions to use `pd.NA` as the default missing
+   value sentinel, a migration path for _all_ pandas data types will be needed,
+   and thus the challenges around this will not be unique to the string dtype
+   and are therefore not a reason to delay this.
+
+Making this change now for 3.0 will benefit the majority of users, and the PDEP
+author believes this is worth the cost of the added complexity around "yet
+another dtype" (also for other data types we already have multiple variants).
+
+### Why not use the existing StringDtype with `pd.NA`?
+
+Wouldn't adding even more variants of the string dtype only make things more
+confusing? Indeed, this proposal unfortunately introduces more variants of the
+string dtype. However, the reason for this is to ensure the actual default user
+experience is _less_ confusing, and the new string dtype fits better with the
+other default data types.
+
+If the new default string data type were to use `pd.NA`, then after some
+operations a user could easily end up with a DataFrame that mixes columns using
+`NaN` semantics and columns using `NA` semantics (and thus a DataFrame that
+could have columns with two different int64, two different float64, two
+different bool, etc. dtypes). This would lead to a very confusing default
+experience.
+
+The proposed new variant of the StringDtype ensures that, for the _default_
+experience, a user will only see one kind of integer dtype, one kind of bool
+dtype, etc. For now, a user should only get columns using `pd.NA` when
+explicitly opting into this.
+
+### Naming alternatives
+
+An initial version of this PDEP proposed to use the `"string"` alias and the
+default `pd.StringDtype()` class constructor for the new default dtype.
+However, that caused a lot of discussion around backwards compatibility for
+existing users of `dtype=pd.StringDtype()` and `dtype="string"`, which use
+`pd.NA` to represent missing values.
+
+During the discussion, several alternatives were brought up: both alternative
+keyword names and the use of a different constructor. In the end, this PDEP
+proposes to use a different string alias (`"str"`) but to keep using the
+existing `pd.StringDtype` (with the existing `storage` keyword but with an
+additional `na_value` keyword) for now, to keep the changes as minimal as
+possible, leaving a larger overhaul of the dtype system (potentially including
+different constructor functions or namespaces) for a future discussion.
+See [GH-58613](https://github.com/pandas-dev/pandas/issues/58613) for the full
+discussion.
+
+One consequence is that when using the class constructor for the default dtype,
+it has to be used with non-default arguments, i.e. a user needs to specify
+`pd.StringDtype(na_value=np.nan)` to get the default dtype using `NaN`.
+Therefore, the pandas documentation will focus on the usage of `dtype="str"`.
+
+## Backward compatibility
+
+The most visible backwards incompatible change will be that columns with string
+data will no longer have an `object` dtype. Therefore, code that assumes
+`object` dtype (such as `ser.dtype == object`) will need to be updated. This
+change is done as a hard break in a major release, as warning in advance about
+the changed inference is deemed too noisy.
+
+To allow testing code in advance, the
+`pd.options.future.infer_string = True` option is available for users.
+
+Otherwise, the actual string-specific functionality (such as the `.str` accessor
+methods) should generally all keep working as is.
+
+By preserving the current missing value semantics, this proposal is also mostly
+backwards compatible on this aspect. However, when storing strings in object
+dtype, pandas did also allow `None` as the missing value indicator (and in
+certain cases, such as the `shift` method, pandas even introduced it itself).
+For all the cases where `None` is currently used as the missing value sentinel,
+this will change to consistently use `NaN`.
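+
+For example, a sketch of the kind of update user code may need (the `"str"`
+comparison assumes the alias proposed above, not a released API):
+
+```python
+# Before: relies on string columns being object dtype
+is_text = df["col"].dtype == object
+
+# After: check for the new string dtype instead (object dtype remains
+# possible for genuinely mixed columns)
+is_text = df["col"].dtype == "str"
+```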
+
+### For existing users of `StringDtype`
+
+Existing code that already opted in to use the `StringDtype` using `pd.NA`
+should generally keep working as is. The latest version of this PDEP preserves
+the behaviour of `dtype="string"` or `dtype=pd.StringDtype()` to mean the
+`pd.NA` variant of the dtype.
+
+It does propose to change the default storage to `"pyarrow"` (if available) for
+the opt-in `pd.NA` variant as well, but this should have limited, if any,
+user-visible impact.
+
+## Timeline
+
+The future PyArrow-backed string dtype was already made available behind a feature
+flag in pandas 2.1 (enabled by `pd.options.future.infer_string = True`).
+
+The variant using numpy object-dtype can also be backported to the 2.2.x branch
+to allow easier testing. It is proposed to release this as 2.3.0 (created from
+the 2.2.x branch, given that the main branch already includes many other changes
+targeted for 3.0), together with the changes to the naming scheme.
+
+The 2.3.0 release would then have all future string functionality available
+(both the pyarrow and object-dtype based variants of the default string dtype).
+
+For pandas 3.0, this `future.infer_string` flag becomes enabled by default.
+
+## PDEP-14 History
+
+- 3 May 2024: Initial version
diff --git a/web/pandas_web.py b/web/pandas_web.py
index aac07433f2712..b3872b829c73a 100755
--- a/web/pandas_web.py
+++ b/web/pandas_web.py
@@ -280,6 +280,7 @@ def roadmap_pdeps(context):
     PDEP's in different status from the directory tree and GitHub.
     """
     KNOWN_STATUS = {
+        "Draft",
         "Under discussion",
         "Accepted",
         "Implemented",
@@ -319,7 +320,7 @@ def roadmap_pdeps(context):
     github_repo_url = context["main"]["github_repo_url"]
     resp = requests.get(
         "https://api.github.com/search/issues?"
-        f"q=is:pr is:open label:PDEP repo:{github_repo_url}",
+        f"q=is:pr is:open label:PDEP draft:false repo:{github_repo_url}",
         headers=GITHUB_API_HEADERS,
         timeout=5,
     )