Upgrade pandas to 1.5 (#11617)

This PR introduces `pandas-1.5` support in `cudf`. The changes include:

- [x] Requires `group_keys` support in `groupby` for `dask_cudf` to work: #11659
- [x] Requires `zfill` updates to match `pandas-1.5` behavior: #11634
- [x] `where` API: Ability to check whether a scalar value fits into the existing dtype, similar to: pandas-dev/pandas#48373
- [x] Switches `ValueError` to `TypeError` when an unknown category is set on a `CategoricalColumn`.
- [x] Handles the breaking change to an `ArrowIntervalType`-related import that caused `cudf` to fail at import time.
- [x] Fixes an issue with `IntervalColumn.to_pandas`.
- [x] Raises an error when an object of `boolean` dtype is set on a `NumericalColumn`.
- [x] Raises an error when `pat` is None in `Series.str.startswith` & `Series.str.endswith`.
- [x] Adds `IntervalDtype.to_pandas` with appropriate versioning.
- [x] Handles `get_window_bounds` signature changes.
- [x] Fixes and versions a number of pytests.

```python
branch-22.10:
== 4275 failed, 79837 passed, 2049 skipped, 1193 xfailed, 1923 xpassed, 6597 warnings, 4 errors in 1103.52s (0:18:23) ==
== 803 failed, 106 passed, 14 skipped, 14 xfailed, 324 warnings, 17 errors in 148.46s (0:02:28) ==

This PR:
== 84041 passed, 2049 skipped, 1199 xfailed, 1710 xpassed, 6599 warnings in 359.27s (0:05:59) ==
== 954 passed, 14 skipped, 7 xfailed, 3 xpassed, 580 warnings in 54.75s ==
```

Authors:
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- Ashwin Srinath (https://github.com/shwina)
- Matthew Roeschke (https://github.com/mroeschke)
- Mark Sadang (https://github.com/msadang)

URL: #11617
rapidsai · Sep 21, 2022 · 387c5ff · 387c5ff
1 parent 0528b38
commit 387c5ff
Show file tree

Hide file tree

Showing 30 changed files with 326 additions and 113 deletions.
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -79,7 +79,6 @@ gpuci_logger "Check conda environment"
 conda info
 conda config --show-sources
 conda list --show-channel-urls
-
 gpuci_logger "Check compiler versions"
 python --version
 
@@ -251,6 +250,8 @@ fi
 
 cd "$WORKSPACE/python/cudf/cudf"
 # It is essential to cd into $WORKSPACE/python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
+gpuci_logger "Check conda packages"
+conda list
 gpuci_logger "Python py.test for cuDF"
 py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests
 

diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml
@@ -20,7 +20,7 @@ dependencies:
   - python>=3.8,<3.10
   - numba>=0.54
   - numpy
-  - pandas>=1.0,<1.5.0dev0
+  - pandas>=1.0,<1.6.0dev0
   - pyarrow=9
   - fastavro>=0.22.9
   - python-snappy>=0.6.0

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -48,7 +48,7 @@ requirements:
     - protobuf>=3.20.1,<3.21.0a0
     - python
     - typing_extensions
-    - pandas >=1.0,<1.5.0dev0
+    - pandas >=1.0,<1.6.0dev0
     - cupy >=9.5.0,<12.0.0a0
     - numba >=0.54
     - numpy

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
@@ -16,6 +16,7 @@
 from cudf.core.missing import NA
 from cudf.utils.dtypes import (
     _can_cast,
+    _dtype_can_hold_element,
     find_common_type,
     is_mixed_with_object_dtype,
 )
@@ -84,6 +85,12 @@ def _check_and_cast_columns_with_other(
         other, source_dtype
     ):
         common_dtype = source_dtype
+    elif (
+        isinstance(source_col, cudf.core.column.NumericalColumn)
+        and other_is_scalar
+        and _dtype_can_hold_element(source_dtype, other)
+    ):
+        common_dtype = source_dtype
     else:
         common_dtype = find_common_type(
             [

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -755,7 +755,7 @@ def __setitem__(self, key, value):
             )
 
         if to_add_categories > 0:
-            raise ValueError(
+            raise TypeError(
                 "Cannot setitem on a Categorical with a new "
                 "category, set the categories first"
             )

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -62,6 +62,7 @@
     is_string_dtype,
     is_struct_dtype,
 )
+from cudf.core._compat import PANDAS_GE_150
 from cudf.core.abc import Serializable
 from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like
 from cudf.core.dtypes import (
@@ -83,6 +84,11 @@
 )
 from cudf.utils.utils import _array_ufunc, mask_dtype
 
+if PANDAS_GE_150:
+    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
+else:
+    from pandas.core.arrays._arrow_utils import ArrowIntervalType
+
 T = TypeVar("T", bound="ColumnBase")
 # TODO: This workaround allows type hints for `slice`, since `slice` is a
 # method in ColumnBase.
@@ -290,9 +296,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 size=codes.size,
                 ordered=array.type.ordered,
             )
-        elif isinstance(
-            array.type, pd.core.arrays._arrow_utils.ArrowIntervalType
-        ):
+        elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
 
         result = libcudf.interop.from_arrow(data)[0]

diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
@@ -131,5 +131,5 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
         # types into pandas (trying to convert the underlying numerical columns
         # directly is problematic), so we're stuck with this for now.
         return pd.Series(
-            pd.IntervalDtype().__from_arrow__(self.to_arrow()), index=index
+            self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
         )
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -34,6 +34,7 @@
     is_integer,
     is_integer_dtype,
     is_number,
+    is_scalar,
 )
 from cudf.core.buffer import DeviceBufferLike, as_device_buffer_like
 from cudf.core.column import (
@@ -128,6 +129,43 @@ def has_nulls(self, include_nan=False):
             self.nan_count != 0 if include_nan else False
         )
 
+    def __setitem__(self, key: Any, value: Any):
+        """
+        Set the value of ``self[key]`` to ``value``.
+
+        If ``value`` and ``self`` are of different types, ``value`` is coerced
+        to ``self.dtype``.
+        """
+
+        # Normalize value to scalar/column
+        device_value = (
+            cudf.Scalar(
+                value,
+                dtype=self.dtype
+                if cudf._lib.scalar._is_null_host_scalar(value)
+                else None,
+            )
+            if is_scalar(value)
+            else as_column(value)
+        )
+
+        if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype):
+            raise TypeError(f"Invalid value {value} for dtype {self.dtype}")
+        else:
+            device_value = device_value.astype(self.dtype)
+
+        out: Optional[ColumnBase]  # If None, no need to perform mimic inplace.
+        if isinstance(key, slice):
+            out = self._scatter_by_slice(key, device_value)
+        else:
+            key = as_column(key)
+            if not isinstance(key, cudf.core.column.NumericalColumn):
+                raise ValueError(f"Invalid scatter map type {key.dtype}.")
+            out = self._scatter_by_column(key, device_value)
+
+        if out:
+            self._mimic_inplace(out, inplace=True)
+
     @property
     def __cuda_array_interface__(self) -> Mapping[str, Any]:
         output = {

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -3752,8 +3752,9 @@ def endswith(self, pat: str) -> SeriesOrIndex:
         dtype: bool
         """
         if pat is None:
-            result_col = column.column_empty(
-                len(self._column), dtype="bool", masked=True
+            raise TypeError(
+                f"expected a string or a sequence-like object, not "
+                f"{type(pat).__name__}"
             )
         elif is_scalar(pat):
             result_col = libstrings.endswith(
@@ -3814,8 +3815,9 @@ def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex:
         dtype: bool
         """
         if pat is None:
-            result_col = column.column_empty(
-                len(self._column), dtype="bool", masked=True
+            raise TypeError(
+                f"expected a string or a sequence-like object, not "
+                f"{type(pat).__name__}"
             )
         elif is_scalar(pat):
             result_col = libstrings.startswith(

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
@@ -10,18 +10,22 @@
 import pyarrow as pa
 from pandas.api import types as pd_types
 from pandas.api.extensions import ExtensionDtype
-from pandas.core.arrays._arrow_utils import ArrowIntervalType
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype as pd_CategoricalDtype,
     CategoricalDtypeType as pd_CategoricalDtypeType,
 )
 
 import cudf
 from cudf._typing import Dtype
-from cudf.core._compat import PANDAS_GE_130
+from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150
 from cudf.core.abc import Serializable
 from cudf.core.buffer import DeviceBufferLike
 
+if PANDAS_GE_150:
+    from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
+else:
+    from pandas.core.arrays._arrow_utils import ArrowIntervalType
+
 
 def dtype(arbitrary):
     """
@@ -610,6 +614,12 @@ def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype":
         else:
             return cls(subtype=pd_dtype.subtype)
 
+    def to_pandas(self) -> pd.IntervalDtype:
+        if PANDAS_GE_130:
+            return pd.IntervalDtype(subtype=self.subtype, closed=self.closed)
+        else:
+            return pd.IntervalDtype(subtype=self.subtype)
+
     def __eq__(self, other):
         if isinstance(other, str):
             # This means equality isn't transitive but mimics pandas

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -20,7 +20,7 @@
 from cudf._typing import DataFrameOrSeries
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
-from cudf.core._compat import PANDAS_GE_120
+from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150
 from cudf.core.frame import Frame
 from cudf.core.index import (
     BaseIndex,
@@ -451,8 +451,8 @@ def __repr__(self):
                 )
             )
 
-            if PANDAS_GE_120:
-                # TODO: Remove this whole `if` block,
+            if PANDAS_GE_120 and not PANDAS_GE_150:
+                # Need this whole `if` block,
                 # this is a workaround for the following issue:
                 # https://github.com/pandas-dev/pandas/issues/39984
                 preprocess_pdf = pd.DataFrame(

diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
@@ -10,6 +10,7 @@
 from cudf import _lib as libcudf
 from cudf.api.types import is_integer, is_number
 from cudf.core import column
+from cudf.core._compat import PANDAS_GE_150
 from cudf.core.column.column import as_column
 from cudf.core.mixins import Reducible
 from cudf.utils import cudautils
@@ -215,12 +216,21 @@ def _apply_agg_column(self, source_column, agg_name):
             following_window = None
             window = self.window
         elif isinstance(self.window, BaseIndexer):
-            start, end = self.window.get_window_bounds(
-                num_values=len(self.obj),
-                min_periods=self.min_periods,
-                center=self.center,
-                closed=None,
-            )
+            if PANDAS_GE_150:
+                start, end = self.window.get_window_bounds(
+                    num_values=len(self.obj),
+                    min_periods=self.min_periods,
+                    center=self.center,
+                    closed=None,
+                    step=None,
+                )
+            else:
+                start, end = self.window.get_window_bounds(
+                    num_values=len(self.obj),
+                    min_periods=self.min_periods,
+                    center=self.center,
+                    closed=None,
+                )
             start = as_column(start, dtype="int32")
             end = as_column(end, dtype="int32")
 

diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -10,6 +10,7 @@
 import pytest
 
 import cudf
+from cudf.core._compat import PANDAS_GE_150
 from cudf.testing._utils import assert_eq, set_random_null_mask_inplace
 
 _UFUNCS = [
@@ -84,14 +85,19 @@ def test_ufunc_index(ufunc):
                 assert_eq(g, e, check_exact=False)
         else:
             assert_eq(got, expect, check_exact=False)
-    except AssertionError:
+    except AssertionError as e:
         # TODO: This branch can be removed when
         # https://github.com/rapidsai/cudf/issues/10178 is resolved
         if fname in ("power", "float_power"):
             if (got - expect).abs().max() == 1:
                 pytest.xfail("https://github.com/rapidsai/cudf/issues/10178")
         elif fname in ("bitwise_and", "bitwise_or", "bitwise_xor"):
-            pytest.xfail("https://github.com/pandas-dev/pandas/issues/46769")
+            if PANDAS_GE_150:
+                raise e
+            else:
+                pytest.xfail(
+                    "https://github.com/pandas-dev/pandas/issues/46769"
+                )
         raise
 
 

diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
@@ -13,6 +13,7 @@
 
 import cudf
 from cudf import Series
+from cudf.core._compat import PANDAS_GE_150
 from cudf.core.index import as_index
 from cudf.testing import _utils as utils
 from cudf.utils.dtypes import (
@@ -768,7 +769,7 @@ def test_operator_func_between_series_logical(
 @pytest.mark.parametrize("func", _operators_comparison)
 @pytest.mark.parametrize("has_nulls", [True, False])
 @pytest.mark.parametrize("scalar", [-59.0, np.nan, 0, 59.0])
-@pytest.mark.parametrize("fill_value", [None, True, False, 1.0])
+@pytest.mark.parametrize("fill_value", [None, 1.0])
 @pytest.mark.parametrize("use_cudf_scalar", [False, True])
 def test_operator_func_series_and_scalar_logical(
     dtype, func, has_nulls, scalar, fill_value, use_cudf_scalar
@@ -1561,7 +1562,8 @@ def test_scalar_null_binops(op, dtype_l, dtype_r):
         pytest.param(
             "nanoseconds",
             marks=pytest.mark.xfail(
-                reason="https://github.com/pandas-dev/pandas/issues/36589"
+                condition=not PANDAS_GE_150,
+                reason="https://github.com/pandas-dev/pandas/issues/36589",
             ),
         ),
     ],
@@ -1668,7 +1670,8 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op):
         pytest.param(
             "nanoseconds",
             marks=pytest.mark.xfail(
-                reason="https://github.com/pandas-dev/pandas/issues/36589"
+                condition=not PANDAS_GE_150,
+                reason="https://github.com/pandas-dev/pandas/issues/36589",
             ),
         ),
     ],

diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
@@ -414,7 +414,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace):
         pytest.param(
             True,
             marks=pytest.mark.skipif(
-                not PANDAS_GE_134,
+                condition=not PANDAS_GE_134,
                 reason="https://github.com/pandas-dev/pandas/issues/43232",
             ),
         ),
@@ -454,7 +454,7 @@ def test_categorical_reorder_categories(
         pytest.param(
             True,
             marks=pytest.mark.skipif(
-                not PANDAS_GE_134,
+                condition=not PANDAS_GE_134,
                 reason="https://github.com/pandas-dev/pandas/issues/43232",
             ),
         ),
@@ -491,7 +491,7 @@ def test_categorical_add_categories(pd_str_cat, inplace):
         pytest.param(
             True,
             marks=pytest.mark.skipif(
-                not PANDAS_GE_134,
+                condition=not PANDAS_GE_134,
                 reason="https://github.com/pandas-dev/pandas/issues/43232",
             ),
         ),