Skip to content

Commit

Permalink
Upgrade pandas to 1.5 (#11617)
Browse files Browse the repository at this point in the history
This PR introduces `pandas-1.5` support in `cudf`. The changes include:

- [x] Requires `group_keys` support in `groupby` for `dask_cudf` to work: #11659
- [x] Requires `zfill` updates to match `pandas-1.5` behavior: #11634
- [x] `where` API: Adds the ability to check whether a scalar value fits into the existing dtype, similar to: pandas-dev/pandas#48373
- [x] Switches `ValueError` to `TypeError` when an unknown category is being set to a `CategoricalColumn`
- [x] Handles a breaking change in the import location of `ArrowIntervalType`, which caused `cudf` to raise an error on import.
- [x] Fix an issue with `IntervalColumn.to_pandas`.
- [x] Raises an error when a value of `boolean` dtype is set on a non-boolean `NumericalColumn`.
- [x] Raises an error when `pat` is `None` in `Series.str.startswith` & `Series.str.endswith`.
- [x] Add `IntervalDtype.to_pandas` with appropriate versioning.
- [x] Handle `get_window_bounds` signature changes.
- [x] Fix and version a bunch of pytests.

```python
branch-22.10:

== 4275 failed, 79837 passed, 2049 skipped, 1193 xfailed, 1923 xpassed, 6597 warnings, 4 errors in 1103.52s (0:18:23) ==
== 803 failed, 106 passed, 14 skipped, 14 xfailed, 324 warnings, 17 errors in 148.46s (0:02:28) ==

This PR:

== 84041 passed, 2049 skipped, 1199 xfailed, 1710 xpassed, 6599 warnings in 359.27s (0:05:59) ==
== 954 passed, 14 skipped, 7 xfailed, 3 xpassed, 580 warnings in 54.75s ==
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Mark Sadang (https://github.com/msadang)

URL: #11617
  • Loading branch information
galipremsagar authored Sep 21, 2022
1 parent 0528b38 commit 387c5ff
Show file tree
Hide file tree
Showing 30 changed files with 326 additions and 113 deletions.
3 changes: 2 additions & 1 deletion ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ gpuci_logger "Check conda environment"
conda info
conda config --show-sources
conda list --show-channel-urls

gpuci_logger "Check compiler versions"
python --version

Expand Down Expand Up @@ -251,6 +250,8 @@ fi

cd "$WORKSPACE/python/cudf/cudf"
# It is essential to cd into $WORKSPACE/python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
gpuci_logger "Check conda packages"
conda list
gpuci_logger "Python py.test for cuDF"
py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests

Expand Down
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.5.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies:
- python>=3.8,<3.10
- numba>=0.54
- numpy
- pandas>=1.0,<1.5.0dev0
- pandas>=1.0,<1.6.0dev0
- pyarrow=9
- fastavro>=0.22.9
- python-snappy>=0.6.0
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ requirements:
- protobuf>=3.20.1,<3.21.0a0
- python
- typing_extensions
- pandas >=1.0,<1.5.0dev0
- pandas >=1.0,<1.6.0dev0
- cupy >=9.5.0,<12.0.0a0
- numba >=0.54
- numpy
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/core/_internals/where.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from cudf.core.missing import NA
from cudf.utils.dtypes import (
_can_cast,
_dtype_can_hold_element,
find_common_type,
is_mixed_with_object_dtype,
)
Expand Down Expand Up @@ -84,6 +85,12 @@ def _check_and_cast_columns_with_other(
other, source_dtype
):
common_dtype = source_dtype
elif (
isinstance(source_col, cudf.core.column.NumericalColumn)
and other_is_scalar
and _dtype_can_hold_element(source_dtype, other)
):
common_dtype = source_dtype
else:
common_dtype = find_common_type(
[
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,7 @@ def __setitem__(self, key, value):
)

if to_add_categories > 0:
raise ValueError(
raise TypeError(
"Cannot setitem on a Categorical with a new "
"category, set the categories first"
)
Expand Down
10 changes: 7 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
is_string_dtype,
is_struct_dtype,
)
from cudf.core._compat import PANDAS_GE_150
from cudf.core.abc import Serializable
from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like
from cudf.core.dtypes import (
Expand All @@ -83,6 +84,11 @@
)
from cudf.utils.utils import _array_ufunc, mask_dtype

if PANDAS_GE_150:
from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
else:
from pandas.core.arrays._arrow_utils import ArrowIntervalType

T = TypeVar("T", bound="ColumnBase")
# TODO: This workaround allows type hints for `slice`, since `slice` is a
# method in ColumnBase.
Expand Down Expand Up @@ -290,9 +296,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
size=codes.size,
ordered=array.type.ordered,
)
elif isinstance(
array.type, pd.core.arrays._arrow_utils.ArrowIntervalType
):
elif isinstance(array.type, ArrowIntervalType):
return cudf.core.column.IntervalColumn.from_arrow(array)

result = libcudf.interop.from_arrow(data)[0]
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,5 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
# types into pandas (trying to convert the underlying numerical columns
# directly is problematic), so we're stuck with this for now.
return pd.Series(
pd.IntervalDtype().__from_arrow__(self.to_arrow()), index=index
self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
)
38 changes: 38 additions & 0 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
is_integer,
is_integer_dtype,
is_number,
is_scalar,
)
from cudf.core.buffer import DeviceBufferLike, as_device_buffer_like
from cudf.core.column import (
Expand Down Expand Up @@ -128,6 +129,43 @@ def has_nulls(self, include_nan=False):
self.nan_count != 0 if include_nan else False
)

def __setitem__(self, key: Any, value: Any):
    """
    Assign ``value`` to the positions of this column selected by ``key``.

    ``value`` is normalized to a device scalar (when it is a scalar) or to
    a column (otherwise) and then cast to ``self.dtype`` before being
    scattered into the column.

    Parameters
    ----------
    key : Any
        Either a ``slice`` or an object that ``as_column`` converts into a
        numerical column used as the scatter map.
    value : Any
        Scalar or column-like object to assign.

    Raises
    ------
    TypeError
        If ``value`` has a boolean dtype while this column does not.
    ValueError
        If ``key`` is not a slice and does not convert to a
        ``NumericalColumn``.
    """
    # Normalize value to scalar/column.
    if is_scalar(value):
        # Only a null host scalar is built with this column's dtype;
        # every other scalar is constructed with ``dtype=None``.
        scalar_dtype = (
            self.dtype
            if cudf._lib.scalar._is_null_host_scalar(value)
            else None
        )
        device_value = cudf.Scalar(value, dtype=scalar_dtype)
    else:
        device_value = as_column(value)

    # Reject boolean values for non-boolean columns instead of casting.
    column_is_bool = is_bool_dtype(self.dtype)
    if not column_is_bool and is_bool_dtype(device_value.dtype):
        raise TypeError(f"Invalid value {value} for dtype {self.dtype}")
    device_value = device_value.astype(self.dtype)

    out: Optional[ColumnBase]  # If None, no need to perform mimic inplace.
    if isinstance(key, slice):
        out = self._scatter_by_slice(key, device_value)
    else:
        scatter_map = as_column(key)
        if not isinstance(scatter_map, cudf.core.column.NumericalColumn):
            raise ValueError(
                f"Invalid scatter map type {scatter_map.dtype}."
            )
        out = self._scatter_by_column(scatter_map, device_value)

    if out:
        self._mimic_inplace(out, inplace=True)

@property
def __cuda_array_interface__(self) -> Mapping[str, Any]:
output = {
Expand Down
10 changes: 6 additions & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3752,8 +3752,9 @@ def endswith(self, pat: str) -> SeriesOrIndex:
dtype: bool
"""
if pat is None:
result_col = column.column_empty(
len(self._column), dtype="bool", masked=True
raise TypeError(
f"expected a string or a sequence-like object, not "
f"{type(pat).__name__}"
)
elif is_scalar(pat):
result_col = libstrings.endswith(
Expand Down Expand Up @@ -3814,8 +3815,9 @@ def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex:
dtype: bool
"""
if pat is None:
result_col = column.column_empty(
len(self._column), dtype="bool", masked=True
raise TypeError(
f"expected a string or a sequence-like object, not "
f"{type(pat).__name__}"
)
elif is_scalar(pat):
result_col = libstrings.startswith(
Expand Down
14 changes: 12 additions & 2 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,22 @@
import pyarrow as pa
from pandas.api import types as pd_types
from pandas.api.extensions import ExtensionDtype
from pandas.core.arrays._arrow_utils import ArrowIntervalType
from pandas.core.dtypes.dtypes import (
CategoricalDtype as pd_CategoricalDtype,
CategoricalDtypeType as pd_CategoricalDtypeType,
)

import cudf
from cudf._typing import Dtype
from cudf.core._compat import PANDAS_GE_130
from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150
from cudf.core.abc import Serializable
from cudf.core.buffer import DeviceBufferLike

if PANDAS_GE_150:
from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
else:
from pandas.core.arrays._arrow_utils import ArrowIntervalType


def dtype(arbitrary):
"""
Expand Down Expand Up @@ -610,6 +614,12 @@ def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype":
else:
return cls(subtype=pd_dtype.subtype)

def to_pandas(self) -> pd.IntervalDtype:
    """
    Build the ``pd.IntervalDtype`` corresponding to this dtype.

    ``closed`` is forwarded only when ``PANDAS_GE_130`` is true; otherwise
    the pandas dtype is constructed from ``subtype`` alone.
    NOTE(review): presumably older pandas does not accept ``closed`` in
    the constructor -- confirm.
    """
    if not PANDAS_GE_130:
        return pd.IntervalDtype(subtype=self.subtype)
    return pd.IntervalDtype(subtype=self.subtype, closed=self.closed)

def __eq__(self, other):
if isinstance(other, str):
# This means equality isn't transitive but mimics pandas
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from cudf._typing import DataFrameOrSeries
from cudf.api.types import is_integer, is_list_like, is_object_dtype
from cudf.core import column
from cudf.core._compat import PANDAS_GE_120
from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150
from cudf.core.frame import Frame
from cudf.core.index import (
BaseIndex,
Expand Down Expand Up @@ -451,8 +451,8 @@ def __repr__(self):
)
)

if PANDAS_GE_120:
# TODO: Remove this whole `if` block,
if PANDAS_GE_120 and not PANDAS_GE_150:
# Need this whole `if` block,
# this is a workaround for the following issue:
# https://github.com/pandas-dev/pandas/issues/39984
preprocess_pdf = pd.DataFrame(
Expand Down
22 changes: 16 additions & 6 deletions python/cudf/cudf/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from cudf import _lib as libcudf
from cudf.api.types import is_integer, is_number
from cudf.core import column
from cudf.core._compat import PANDAS_GE_150
from cudf.core.column.column import as_column
from cudf.core.mixins import Reducible
from cudf.utils import cudautils
Expand Down Expand Up @@ -215,12 +216,21 @@ def _apply_agg_column(self, source_column, agg_name):
following_window = None
window = self.window
elif isinstance(self.window, BaseIndexer):
start, end = self.window.get_window_bounds(
num_values=len(self.obj),
min_periods=self.min_periods,
center=self.center,
closed=None,
)
if PANDAS_GE_150:
start, end = self.window.get_window_bounds(
num_values=len(self.obj),
min_periods=self.min_periods,
center=self.center,
closed=None,
step=None,
)
else:
start, end = self.window.get_window_bounds(
num_values=len(self.obj),
min_periods=self.min_periods,
center=self.center,
closed=None,
)
start = as_column(start, dtype="int32")
end = as_column(end, dtype="int32")

Expand Down
10 changes: 8 additions & 2 deletions python/cudf/cudf/tests/test_array_ufunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest

import cudf
from cudf.core._compat import PANDAS_GE_150
from cudf.testing._utils import assert_eq, set_random_null_mask_inplace

_UFUNCS = [
Expand Down Expand Up @@ -84,14 +85,19 @@ def test_ufunc_index(ufunc):
assert_eq(g, e, check_exact=False)
else:
assert_eq(got, expect, check_exact=False)
except AssertionError:
except AssertionError as e:
# TODO: This branch can be removed when
# https://github.com/rapidsai/cudf/issues/10178 is resolved
if fname in ("power", "float_power"):
if (got - expect).abs().max() == 1:
pytest.xfail("https://github.com/rapidsai/cudf/issues/10178")
elif fname in ("bitwise_and", "bitwise_or", "bitwise_xor"):
pytest.xfail("https://github.com/pandas-dev/pandas/issues/46769")
if PANDAS_GE_150:
raise e
else:
pytest.xfail(
"https://github.com/pandas-dev/pandas/issues/46769"
)
raise


Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import cudf
from cudf import Series
from cudf.core._compat import PANDAS_GE_150
from cudf.core.index import as_index
from cudf.testing import _utils as utils
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -768,7 +769,7 @@ def test_operator_func_between_series_logical(
@pytest.mark.parametrize("func", _operators_comparison)
@pytest.mark.parametrize("has_nulls", [True, False])
@pytest.mark.parametrize("scalar", [-59.0, np.nan, 0, 59.0])
@pytest.mark.parametrize("fill_value", [None, True, False, 1.0])
@pytest.mark.parametrize("fill_value", [None, 1.0])
@pytest.mark.parametrize("use_cudf_scalar", [False, True])
def test_operator_func_series_and_scalar_logical(
dtype, func, has_nulls, scalar, fill_value, use_cudf_scalar
Expand Down Expand Up @@ -1561,7 +1562,8 @@ def test_scalar_null_binops(op, dtype_l, dtype_r):
pytest.param(
"nanoseconds",
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/36589"
condition=not PANDAS_GE_150,
reason="https://github.com/pandas-dev/pandas/issues/36589",
),
),
],
Expand Down Expand Up @@ -1668,7 +1670,8 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op):
pytest.param(
"nanoseconds",
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/36589"
condition=not PANDAS_GE_150,
reason="https://github.com/pandas-dev/pandas/issues/36589",
),
),
],
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace):
pytest.param(
True,
marks=pytest.mark.skipif(
not PANDAS_GE_134,
condition=not PANDAS_GE_134,
reason="https://github.com/pandas-dev/pandas/issues/43232",
),
),
Expand Down Expand Up @@ -454,7 +454,7 @@ def test_categorical_reorder_categories(
pytest.param(
True,
marks=pytest.mark.skipif(
not PANDAS_GE_134,
condition=not PANDAS_GE_134,
reason="https://github.com/pandas-dev/pandas/issues/43232",
),
),
Expand Down Expand Up @@ -491,7 +491,7 @@ def test_categorical_add_categories(pd_str_cat, inplace):
pytest.param(
True,
marks=pytest.mark.skipif(
not PANDAS_GE_134,
condition=not PANDAS_GE_134,
reason="https://github.com/pandas-dev/pandas/issues/43232",
),
),
Expand Down
Loading

0 comments on commit 387c5ff

Please sign in to comment.