[REVIEW] Upgrade pandas to 1.2 #7375

Merged Feb 26, 2021 (46 commits)
Changes from 19 commits
155f10f
fix issues with updating to latest pandas
galipremsagar Feb 12, 2021
0ec247e
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 12, 2021
454ecf5
remove xfails and fix issues
galipremsagar Feb 12, 2021
a1a928d
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 18, 2021
303c77d
fix isin and misc tests
galipremsagar Feb 22, 2021
18d1fb3
remove redundant code
galipremsagar Feb 22, 2021
b727253
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 22, 2021
01afece
fix more issues
galipremsagar Feb 22, 2021
691d154
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
c7c47b5
fix lots of deprecated warnings
galipremsagar Feb 23, 2021
d106b79
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
aea3313
fix multiple warnings
galipremsagar Feb 23, 2021
9fdbfe7
unpin pandas
galipremsagar Feb 23, 2021
27a782b
cleanup
galipremsagar Feb 23, 2021
3cde2ef
cleanup
galipremsagar Feb 23, 2021
9a3b51a
copyright
galipremsagar Feb 23, 2021
2f8fe18
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
7a534b0
pin pandas upper bound version
galipremsagar Feb 24, 2021
81d9b5d
use only minor version
galipremsagar Feb 24, 2021
14e8c0e
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 24, 2021
c5b83a2
use functools for finding union
galipremsagar Feb 24, 2021
5e6855d
add utility for creating a pandas series and refactor imports in test…
galipremsagar Feb 24, 2021
ea61733
remove is_scalar check
galipremsagar Feb 24, 2021
d8ca966
version all pytest xfails
galipremsagar Feb 24, 2021
8d079f0
add check_order flag
galipremsagar Feb 24, 2021
d8ff534
remove version for cudf apis
galipremsagar Feb 24, 2021
a0637b9
make importing cudf uniform in pytests
galipremsagar Feb 24, 2021
b63ae03
refactor imports to be uniform and less confusing
galipremsagar Feb 24, 2021
c3c3e68
remove versioning of cudf api call
galipremsagar Feb 24, 2021
992b483
Update python/cudf/cudf/tests/test_setitem.py
galipremsagar Feb 24, 2021
355e192
remove double validation
galipremsagar Feb 24, 2021
3942cf1
Merge branch '7367' of https://github.com/galipremsagar/cudf into 7367
galipremsagar Feb 24, 2021
8d06667
move datetime / duration isin logic to a common utility
galipremsagar Feb 24, 2021
032378d
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 25, 2021
dd842f3
add atol
galipremsagar Feb 25, 2021
9fe44cd
rename internal api
galipremsagar Feb 25, 2021
da1a3a3
fix categorical setitem and allow np.nan into categories
galipremsagar Feb 26, 2021
e70686f
add nan setitem test
galipremsagar Feb 26, 2021
39ba07a
make null checks and to_pandas code flow more effecient
galipremsagar Feb 26, 2021
2cc496d
fix repr
galipremsagar Feb 26, 2021
0bd3bba
fix typo
galipremsagar Feb 26, 2021
3d44f5f
fix typo
galipremsagar Feb 26, 2021
c1c2d96
update index code
galipremsagar Feb 26, 2021
19ae2f6
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 26, 2021
ae1b8c6
add packaging conda install
galipremsagar Feb 26, 2021
416bc92
Merge branch 'branch-0.19' into 7367
galipremsagar Feb 26, 2021
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.1.yml
@@ -17,7 +17,7 @@ dependencies:
- python>=3.6,<3.8
- numba>=0.49.0,!=0.51.0
- numpy
- pandas>=1.0,<1.2.0dev0
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda10.2.yml
@@ -17,7 +17,7 @@ dependencies:
- python>=3.6,<3.8
- numba>=0.49,!=0.51.0
- numpy
- pandas>=1.0,<1.2.0dev0
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
2 changes: 1 addition & 1 deletion conda/environments/cudf_dev_cuda11.0.yml
@@ -17,7 +17,7 @@ dependencies:
- python>=3.6,<3.8
- numba>=0.49,!=0.51.0
- numpy
- pandas>=1.0,<1.2.0dev0
- pandas>=1.0,<1.3.0dev0
- pyarrow=1.0.1
- fastavro>=0.22.9
- notebook>=0.5.0
4 changes: 2 additions & 2 deletions conda/recipes/cudf/meta.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -35,7 +35,7 @@ requirements:
- protobuf
- python
- typing_extensions
- pandas >=1.0,<1.2.0dev0
- pandas >=1.0,<1.3.0dev0
- cupy >7.1.0,<9.0.0a0
- numba >=0.49.0
- numpy
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/_compat.py
@@ -1,8 +1,9 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import pandas as pd
from packaging import version

PANDAS_VERSION = version.parse(pd.__version__)
PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0")
PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1")
PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2")
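These flags let the rest of the codebase gate behavior on the installed pandas version, and the same PEP 440 ordering explains the `<1.3.0dev0` pin in the conda files above: dev releases sort before the final release they precede, so the upper bound admits every 1.2.x build while excluding 1.3 pre-releases. A minimal, cudf-free sketch of the comparison (the `pandas_ge` helper is illustrative, not part of the PR):

```python
from packaging import version

def pandas_ge(installed: str, minimum: str) -> bool:
    # Same comparison used to define PANDAS_GE_110 / PANDAS_GE_120.
    return version.parse(installed) >= version.parse(minimum)

assert pandas_ge("1.2.0", "1.2")
# PEP 440: dev releases sort *before* the release they precede, which is
# what makes a `<1.3.0dev0` upper bound exclude 1.3 dev builds.
assert not pandas_ge("1.2.0.dev0", "1.2")
assert version.parse("1.2.5") < version.parse("1.3.0.dev0")
```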
32 changes: 32 additions & 0 deletions python/cudf/cudf/core/column/categorical.py
@@ -9,6 +9,7 @@
Dict,
Mapping,
Optional,
Sequence,
Tuple,
Union,
cast,
@@ -867,6 +868,15 @@ def set_base_data(self, value):
else:
super().set_base_data(value)

def _process_values_for_isin(
self, values: Sequence
) -> Tuple[ColumnBase, ColumnBase]:
lhs = self
# We need to convert values to same type as self,
# hence passing dtype=self.dtype
rhs = cudf.core.column.as_column(values, dtype=self.dtype)
return lhs, rhs

def set_base_mask(self, value: Optional[Buffer]):
super().set_base_mask(value)
self._codes = None
@@ -936,6 +946,22 @@ def unary_operator(self, unaryop: str):
)

def __setitem__(self, key, value):
if cudf.utils.dtypes.is_scalar(value):
new_values = [value]
else:
new_values = value

to_add_categories = cudf.Index(new_values).difference(self.categories)

if (
len(to_add_categories)
and not to_add_categories.isna()._values.all()
):
Review thread on this check:

Collaborator: This seems really expensive to always be checking. What situations are we trying to handle here?

galipremsagar (author), Feb 24, 2021: Here we are handling the case where we need to allow setting a value in a categorical to np.nan.

Collaborator: Do we actually need to handle NaN, or should it be treated like every other value, where it would be a dictionary entry? This is distinctly different from null.

galipremsagar: Yep, while digging into this I came across issue #7446. I'll think about how to avoid this check and fix the above issue.

raise ValueError(
"Cannot setitem on a Categorical with a new "
"category, set the categories first"
)

if cudf.utils.dtypes.is_scalar(value):
value = self._encode(value) if value is not None else value
else:
@@ -1204,6 +1230,12 @@ def fillna(
raise ValueError(err_msg) from err
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
if isinstance(fill_value, CategoricalColumn):
if self.dtype != fill_value.dtype:
raise ValueError(
"Cannot set a Categorical with another, "
"without identical categories"
)
# TODO: only required if fill_value has a subset of the
# categories:
fill_value = fill_value.cat()._set_categories(
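The `__setitem__` guard above mirrors pandas, which rejects assigning a value that is not an existing category until the categories are extended. A pandas-only sketch of the behavior being matched (the exception type has varied across pandas versions, so both are caught):

```python
import pandas as pd

s = pd.Series(["a", "b", "a"], dtype="category")
try:
    s[0] = "c"  # "c" is not an existing category
    added = True
except (ValueError, TypeError):  # message/type differs across pandas versions
    added = False
assert not added

# Extending the categories first makes the same assignment legal.
s = s.cat.add_categories(["c"])
s[0] = "c"
assert s[0] == "c"
```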
73 changes: 47 additions & 26 deletions python/cudf/cudf/core/column/column.py
@@ -1,4 +1,5 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

from __future__ import annotations

import builtins
@@ -49,12 +50,12 @@
get_time_unit,
is_categorical_dtype,
is_decimal_dtype,
is_interval_dtype,
is_list_dtype,
is_numerical_dtype,
is_scalar,
is_string_dtype,
is_struct_dtype,
is_interval_dtype,
min_signed_type,
min_unsigned_type,
np_to_pa_dtype,
@@ -863,40 +864,60 @@ def isin(self, values: Sequence) -> ColumnBase:
rhs = None

try:
# We need to convert values to same type as self,
# hence passing dtype=self.dtype
rhs = as_column(values, dtype=self.dtype)

# Short-circuit if rhs is all null.
if lhs.null_count == 0 and (rhs.null_count == len(rhs)):
return full(len(self), False, dtype="bool")
lhs, rhs = self._process_values_for_isin(values)
res = lhs._isin_earlystop(rhs)
if res is not None:
return res
except ValueError:
# pandas functionally returns all False when cleansing via
# typecasting fails
return full(len(self), False, dtype="bool")

# If categorical, combine categories first
if is_categorical_dtype(lhs):
lhs_cats = lhs.cat().categories._values
rhs_cats = rhs.cat().categories._values

if not np.issubdtype(rhs_cats.dtype, lhs_cats.dtype):
# If they're not the same dtype, short-circuit if the values
# list doesn't have any nulls. If it does have nulls, make
# the values list a Categorical with a single null
if not rhs.has_nulls:
return full(len(self), False, dtype="bool")
rhs = as_column(pd.Categorical.from_codes([-1], categories=[]))
rhs = rhs.cat().set_categories(lhs_cats).astype(self.dtype)

ldf = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))})
res = lhs._obtain_isin_result(rhs)

return res

def _process_values_for_isin(
self, values: Sequence
) -> Tuple[ColumnBase, ColumnBase]:
"""
Helper function for `isin` which pre-processes `values` based on `self`.
"""
lhs = self
rhs = as_column(values, nan_as_null=False)
if lhs.null_count == len(lhs):
lhs = lhs.astype(rhs.dtype)
elif rhs.null_count == len(rhs):
rhs = rhs.astype(lhs.dtype)
return lhs, rhs

def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]:
"""
Helper function for `isin` which determines whether
early stopping is possible.
"""
if self.dtype != rhs.dtype:
if self.null_count and rhs.null_count:
return self.isna()
else:
return cudf.core.column.full(len(self), False, dtype="bool")
elif self.null_count == 0 and (rhs.null_count == len(rhs)):
return cudf.core.column.full(len(self), False, dtype="bool")
else:
return None

def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
"""
Helper function for `isin` which merges `self` & `rhs`
to determine which values of `self` exist in `rhs`.
"""
ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))})
rdf = cudf.DataFrame(
{"x": rhs, "bool": full(len(rhs), True, dtype="bool")}
)
res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order")
res = res.drop_duplicates(subset="orig_order", ignore_index=True)
res = res._data["bool"].fillna(False)

return res
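The three helpers above split `isin` into preprocessing, an early exit, and a join. The join step can be sketched in plain pandas; `isin_via_merge` is a hypothetical stand-in for `_obtain_isin_result` that deduplicates the targets up front instead of dropping duplicate rows after the merge, which has the same effect:

```python
import pandas as pd

def isin_via_merge(values: pd.Series, targets: pd.Series) -> pd.Series:
    # Tag every target with True, left-join on the value column, and
    # treat rows that found no match (NaN) as False.
    ldf = pd.DataFrame({"x": values, "orig_order": range(len(values))})
    rdf = pd.DataFrame({"x": targets.unique(), "bool": True})
    res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order")
    return res["bool"].fillna(False).reset_index(drop=True)

s = pd.Series([1, 2, 3, 2])
assert isin_via_merge(s, pd.Series([2, 5])).tolist() == [False, True, False, True]
assert s.isin([2, 5]).tolist() == [False, True, False, True]  # matches pandas
```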

def as_mask(self) -> Buffer:
@@ -1250,7 +1271,7 @@ def sum(
def product(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
):
raise TypeError(f"cannot perform prod with type {self.dtype}")
raise TypeError(f"cannot perform product with type {self.dtype}")

def mean(self, skipna: bool = None, dtype: Dtype = None):
raise TypeError(f"cannot perform mean with type {self.dtype}")
@@ -1262,7 +1283,7 @@ def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
raise TypeError(f"cannot perform var with type {self.dtype}")

def kurtosis(self, skipna: bool = None):
raise TypeError(f"cannot perform kurt with type {self.dtype}")
raise TypeError(f"cannot perform kurtosis with type {self.dtype}")

def skew(self, skipna: bool = None):
raise TypeError(f"cannot perform skew with type {self.dtype}")
61 changes: 51 additions & 10 deletions python/cudf/cudf/core/column/datetime.py
@@ -1,4 +1,5 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION.

from __future__ import annotations

import datetime as dt
@@ -13,11 +14,17 @@
import cudf
from cudf import _lib as libcudf
from cudf._typing import DatetimeLikeScalar, Dtype, DtypeObj, ScalarLike
from cudf.core._compat import PANDAS_GE_120
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, column, string
from cudf.utils.dtypes import is_scalar
from cudf.utils.utils import _fillna_natwise

if PANDAS_GE_120:
_guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
else:
_guess_datetime_format = pd.core.tools.datetimes._guess_datetime_format

# nanoseconds per time_unit
_numpy_to_pandas_conversion = {
"ns": 1,
@@ -235,6 +242,19 @@ def mean(self, skipna=None, dtype=np.float64) -> ScalarLike:
unit=self.time_unit,
)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype)
* _numpy_to_pandas_conversion[self.time_unit],
)

def median(self, skipna: bool = None) -> pd.Timestamp:
return pd.Timestamp(
self.as_numerical.median(skipna=skipna), unit=self.time_unit
)
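The new `std` and `median` reductions reuse the numeric representation: compute the statistic over the integer epoch values, then rescale by nanoseconds per time unit (`_numpy_to_pandas_conversion`). A NumPy-based sketch of the same idea (`datetime_std` and `_NS_PER_UNIT` are illustrative names, not PR API):

```python
import numpy as np
import pandas as pd

_NS_PER_UNIT = {"ns": 1, "us": 1_000, "ms": 1_000_000, "s": 1_000_000_000}

def datetime_std(values: np.ndarray, unit: str = "s", ddof: int = 1) -> pd.Timedelta:
    # Std of datetimes == std of their integer epoch counts, rescaled to ns.
    as_int = values.astype("int64")
    return pd.Timedelta(as_int.std(ddof=ddof) * _NS_PER_UNIT[unit])

ts = np.array(["2021-01-01", "2021-01-03", "2021-01-05"], dtype="datetime64[s]")
assert datetime_std(ts) == pd.Timedelta(days=2)
```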

def quantile(
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
) -> ColumnBase:
@@ -316,6 +336,33 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int:
def is_unique(self) -> bool:
return self.as_numerical.is_unique

def isin(self, values: Sequence) -> ColumnBase:
if cudf.utils.dtypes.is_scalar(values):
raise TypeError(
"only list-like objects are allowed to be passed "
f"to isin(), you passed a [{type(values).__name__}]"
)

lhs = self
rhs = None

try:
rhs = cudf.core.column.as_column(values)

if rhs.dtype.kind in {"f", "i", "u"}:
return cudf.core.column.full(len(self), False, dtype="bool")
rhs = rhs.astype(self.dtype)
res = lhs._isin_earlystop(rhs)
if res is not None:
return res
except ValueError:
# pandas functionally returns all False when cleansing via
# typecasting fails
return cudf.core.column.full(len(self), False, dtype="bool")

res = lhs._obtain_isin_result(rhs)
return res
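The numeric-kind check above is the interesting branch: integer, unsigned, or float `values` can never equal a datetime, so `isin` may answer all-False without casting or joining. The rule in isolation (`never_matches_datetimes` is an illustrative helper):

```python
import numpy as np

def never_matches_datetimes(rhs: np.ndarray) -> bool:
    # Mirrors `rhs.dtype.kind in {"f", "i", "u"}` from the diff above.
    return rhs.dtype.kind in {"f", "i", "u"}

assert never_matches_datetimes(np.array([1, 2, 3]))
assert never_matches_datetimes(np.array([1.5]))
assert not never_matches_datetimes(np.array(["2021-02-26"], dtype="datetime64[s]"))
```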

def can_cast_safely(self, to_dtype: Dtype) -> bool:
if np.issubdtype(to_dtype, np.datetime64):

@@ -375,7 +422,7 @@ def infer_format(element: str, **kwargs) -> str:
"""
Infers the datetime format from a string; also takes care of `ms` and `ns`
"""
fmt = pd.core.tools.datetimes._guess_datetime_format(element, **kwargs)
fmt = _guess_datetime_format(element, **kwargs)

if fmt is not None:
return fmt
second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1)
subsecond_fmt = ".%" + str(len(second_parts[0])) + "f"

first_part = pd.core.tools.datetimes._guess_datetime_format(
element_parts[0], **kwargs
)
first_part = _guess_datetime_format(element_parts[0], **kwargs)
# For the case where first_part is '00:00:03'
if first_part is None:
tmp = "1970-01-01 " + element_parts[0]
first_part = pd.core.tools.datetimes._guess_datetime_format(
tmp, **kwargs
).split(" ", 1)[1]
first_part = _guess_datetime_format(tmp, **kwargs).split(" ", 1)[1]
if first_part is None:
raise ValueError("Unable to infer the timestamp format from the data")

Expand All @@ -411,9 +454,7 @@ def infer_format(element: str, **kwargs) -> str:

if len(second_part) > 1:
# Only infer if second_parts is not an empty string.
second_part = pd.core.tools.datetimes._guess_datetime_format(
second_part, **kwargs
)
second_part = _guess_datetime_format(second_part, **kwargs)
else:
second_part = ""

18 changes: 17 additions & 1 deletion python/cudf/cudf/core/column/numerical.py
@@ -3,7 +3,7 @@
from __future__ import annotations

from numbers import Number
from typing import Any, Callable, Sequence, Union, cast
from typing import Any, Callable, Sequence, Tuple, Union, cast

import numpy as np
import pandas as pd
@@ -248,6 +248,22 @@ def std(
) -> float:
return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof)

def _process_values_for_isin(
self, values: Sequence
) -> Tuple[ColumnBase, ColumnBase]:
lhs = cast("cudf.core.column.ColumnBase", self)
rhs = as_column(values, nan_as_null=False)

if isinstance(rhs, NumericalColumn):
rhs = rhs.astype(dtype=self.dtype)

if lhs.null_count == len(lhs):
lhs = lhs.astype(rhs.dtype)
elif rhs.null_count == len(rhs):
rhs = rhs.astype(lhs.dtype)

return lhs, rhs
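The all-null branches exist because a column containing nothing but nulls carries no usable dtype information; adopting the other side's dtype lets the later join proceed. A pandas sketch of the rule (`align_for_isin` is an illustrative name):

```python
import numpy as np
import pandas as pd

def align_for_isin(lhs: pd.Series, rhs: pd.Series):
    # When one side is entirely null, adopt the other side's dtype.
    if lhs.isna().all():
        lhs = lhs.astype(rhs.dtype)
    elif rhs.isna().all():
        rhs = rhs.astype(lhs.dtype)
    return lhs, rhs

lhs = pd.Series([np.nan, np.nan])            # float64 only because it is all-null
rhs = pd.Series(["a", "b"], dtype="object")
lhs, rhs = align_for_isin(lhs, rhs)
assert lhs.dtype == rhs.dtype == np.dtype("object")
```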

def sum_of_squares(self, dtype: Dtype = None) -> float:
return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype)

2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
@@ -5189,7 +5189,7 @@ def _get_cols_list(parent_obj, others):
]

return cols_list
elif others is not None:
elif others is not None and not isinstance(others, StringMethods):
if (
parent_index is not None
and isinstance(others, cudf.Series)