Adding cudf.cut method (#8002)

This PR aims to add the cut method to cudf. Cut in CuDF will mirror [pandas.cut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html) and will be useful to use in `hist`. It also builds off of the `interval` and `IntervalIndex`. Authors: - Marlene (https://github.com/marlenezw) Approvers: - Nghia Truong (https://github.com/ttnghia) - Michael Wang (https://github.com/isVoid) URL: #8002
rapidsai · Jun 11, 2021 · 58b354f · 58b354f
1 parent d3b440e
commit 58b354f
Show file tree

Hide file tree

Showing 9 changed files with 624 additions and 10 deletions.
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
@@ -41,6 +41,7 @@
     from_pandas,
     interval_range,
     merge,
+    cut,
 )
 from cudf.core.algorithms import factorize
 from cudf.core.dtypes import (

diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py
@@ -27,3 +27,4 @@
 from cudf.core.multiindex import MultiIndex
 from cudf.core.scalar import NA, Scalar
 from cudf.core.series import Series
+from cudf.core.cut import cut
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -35,6 +35,7 @@
     is_mixed_with_object_dtype,
     min_signed_type,
     min_unsigned_type,
+    is_interval_dtype,
 )
 
 if TYPE_CHECKING:
@@ -1092,7 +1093,13 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series:
 
         signed_dtype = min_signed_type(len(col.categories))
         codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array()
-        categories = col.categories.dropna(drop_nan=True).to_pandas()
+        if is_interval_dtype(col.categories.dtype):
+            # leaving out dropna because it temporarily changes an interval
+            # index into a struct and throws off results.
+            # TODO: work on interval index dropna
+            categories = col.categories.to_pandas()
+        else:
+            categories = col.categories.dropna(drop_nan=True).to_pandas()
         data = pd.Categorical.from_codes(
             codes, categories=categories, ordered=col.ordered
         )

diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
@@ -72,9 +72,10 @@ def to_arrow(self):
         return pa.ExtensionArray.from_storage(typ, struct_arrow)
 
     def from_struct_column(self, closed="right"):
+        first_field_name = list(self.dtype.fields.keys())[0]
         return IntervalColumn(
             size=self.size,
-            dtype=IntervalDtype(self.dtype.fields["left"], closed),
+            dtype=IntervalDtype(self.dtype.fields[first_field_name], closed),
             mask=self.base_mask,
             offset=self.offset,
             null_count=self.null_count,

diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
@@ -0,0 +1,296 @@
+from cudf._lib.labeling import label_bins
+from cudf.core.column import as_column
+from cudf.core.column import build_categorical_column
+from cudf.core.index import IntervalIndex, interval_range
+from cudf.utils.dtypes import is_list_like
+import cupy
+import cudf
+import numpy as np
+import pandas as pd
+from collections.abc import Sequence
+
+# from cudf._lib.filling import sequence
+
+
+def cut(
+    x,
+    bins,
+    right: bool = True,
+    labels=None,
+    retbins: bool = False,
+    precision: int = 3,
+    include_lowest: bool = False,
+    duplicates: str = "raise",
+    ordered: bool = True,
+):
+
+    """
+    Bin values into discrete intervals.
+    Use cut when you need to segment and sort data values into bins. This
+    function is also useful for going from a continuous variable to a
+    categorical variable.
+    Parameters
+    ----------
+    x : array-like
+        The input array to be binned. Must be 1-dimensional.
+    bins : int, sequence of scalars, or IntervalIndex
+        The criteria to bin by.
+        * int : Defines the number of equal-width bins in the
+        range of x. The range of x is extended by .1% on each
+        side to include the minimum and maximum values of x.
+    right : bool, default True
+        Indicates whether bins includes the rightmost edge or not.
+    labels : array or False, default None
+        Specifies the labels for the returned bins. Must be the same
+        length as the resulting bins. If False, returns only integer
+        indicators of thebins. If True,raises an error. When ordered=False,
+        labels must be provided.
+    retbins : bool, default False
+        Whether to return the bins or not.
+    precision : int, default 3
+        The precision at which to store and display the bins labels.
+    include_lowest : bool, default False
+        Whether the first interval should be left-inclusive or not.
+    duplicates : {default 'raise', 'drop'}, optional
+        If bin edges are not unique, raise ValueError or drop non-uniques.
+    ordered : bool, default True
+        Whether the labels are ordered or not. Applies to returned types
+        Categorical and Series (with Categorical dtype). If True,
+        the resulting categorical will be ordered. If False, the resulting
+        categorical will be unordered (labels must be provided).
+    Returns
+    -------
+    out : CategoricalIndex
+        An array-like object representing the respective bin for each value
+        of x. The type depends on the value of labels.
+    bins : numpy.ndarray or IntervalIndex.
+        The computed or specified bins. Only returned when retbins=True.
+        For scalar or sequence bins, this is an ndarray with the computed
+        bins. If set duplicates=drop, bins will drop non-unique bin. For
+        an IntervalIndex bins, this is equal to bins.
+    Examples
+    --------
+    Discretize into three equal-sized bins.
+    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
+    CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0],
+    ...         (5.0, 7.0],(0.994, 3.0]], categories=[(0.994, 3.0],
+    ...         (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category')
+    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
+    (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0],
+    ...         (5.0, 7.0],(0.994, 3.0]],categories=[(0.994, 3.0],
+    ...         (3.0, 5.0], (5.0, 7.0]],ordered=True, dtype='category'),
+    array([0.994, 3.   , 5.   , 7.   ]))
+    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]),
+    ...        3, labels=["bad", "medium", "good"])
+    CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'],
+    ...       categories=['bad', 'medium', 'good'],ordered=True,
+    ...       dtype='category')
+    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
+    ...       labels=["B", "A", "B"], ordered=False)
+    CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'],
+    ...        ordered=False, dtype='category')
+    >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False)
+    array([0, 1, 1, 3], dtype=int32)
+    Passing a Series as an input returns a Series with categorical dtype:
+    >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]),
+    ...        index=['a', 'b', 'c', 'd', 'e'])
+    >>> cudf.cut(s, 3)
+    """
+    left_inclusive = False
+    right_inclusive = True
+    # saving the original input x for use in case its a series
+    orig_x = x
+    old_bins = bins
+
+    if not ordered and labels is None:
+        raise ValueError("'labels' must be provided if 'ordered = False'")
+
+    if duplicates not in ["raise", "drop"]:
+        raise ValueError(
+            "invalid value for 'duplicates' parameter, valid options are: "
+            "raise, drop"
+        )
+
+    if labels is not False:
+        if not (labels is None or is_list_like(labels)):
+            raise ValueError(
+                "Bin labels must either be False, None or passed in as a "
+                "list-like argument"
+            )
+        elif ordered and labels is not None:
+            if len(set(labels)) != len(labels):
+                raise ValueError(
+                    "labels must be unique if ordered=True;"
+                    "pass ordered=False for duplicate labels"
+                )
+
+    # bins can either be an int, sequence of scalars or an intervalIndex
+    if isinstance(bins, Sequence):
+        if len(set(bins)) is not len(bins):
+            if duplicates == "raise":
+                raise ValueError(
+                    f"Bin edges must be unique: {repr(bins)}.\n"
+                    f"You can drop duplicate edges by setting the 'duplicates'"
+                    "kwarg"
+                )
+            elif duplicates == "drop":
+                # get unique values but maintain list dtype
+                bins = list(dict.fromkeys(bins))
+
+    # if bins is an intervalIndex we ignore the value of right
+    elif isinstance(bins, (pd.IntervalIndex, cudf.IntervalIndex)):
+        right = bins.closed == "right"
+
+    # create bins if given an int or single scalar
+    if not isinstance(bins, pd.IntervalIndex):
+        if not isinstance(bins, (Sequence)):
+            if isinstance(
+                x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray)
+            ):
+                mn = x.min()
+                mx = x.max()
+            else:
+                mn = min(x)
+                mx = max(x)
+            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
+            adj = (mx - mn) * 0.001
+            if right:
+                bins[0] -= adj
+            else:
+                bins[-1] += adj
+
+        # if right and include lowest we adjust the first
+        # bin edge to make sure it is included
+        if right and include_lowest:
+            bins[0] = bins[0] - 10 ** (-precision)
+
+        # if right is false the last bin edge is not included
+        if not right:
+            right_edge = bins[-1]
+            x = cupy.asarray(x)
+            x[x == right_edge] = right_edge + 1
+
+        # adjust bin edges decimal precision
+        int_label_bins = np.around(bins, precision)
+
+    # the inputs is a column of the values in the array x
+    input_arr = as_column(x)
+
+    # checking for the correct inclusivity values
+    if right:
+        closed = "right"
+    else:
+        closed = "left"
+        left_inclusive = True
+
+    if isinstance(bins, pd.IntervalIndex):
+        interval_labels = bins
+    elif labels is None:
+        if duplicates == "drop" and len(bins) == 1 and len(old_bins) != 1:
+            if right and include_lowest:
+                old_bins[0] = old_bins[0] - 10 ** (-precision)
+                interval_labels = interval_range(
+                    old_bins[0], old_bins[1], periods=1, closed=closed
+                )
+            else:
+                interval_labels = IntervalIndex.from_breaks(
+                    old_bins, closed=closed
+                )
+        else:
+            # get labels for categories
+            interval_labels = IntervalIndex.from_breaks(
+                int_label_bins, closed=closed
+            )
+    elif labels is not False:
+        if not (is_list_like(labels)):
+            raise ValueError(
+                "Bin labels must either be False, None or passed in as a "
+                "list-like argument"
+            )
+        if ordered and len(set(labels)) != len(labels):
+            raise ValueError(
+                "labels must be unique if ordered=True; pass ordered=False for"
+                "duplicate labels"
+            )
+        else:
+            if len(labels) != len(bins) - 1:
+                raise ValueError(
+                    "Bin labels must be one fewer than the number of bin edges"
+                )
+            if not ordered and len(set(labels)) != len(labels):
+                interval_labels = cudf.CategoricalIndex(
+                    labels, categories=None, ordered=False
+                )
+            else:
+                interval_labels = (
+                    labels if len(set(labels)) == len(labels) else None
+                )
+
+    if isinstance(bins, pd.IntervalIndex):
+        # get the left and right edges of the bins as columns
+        # we cannot typecast an IntervalIndex, so we need to
+        # make the edges the same type as the input array
+        left_edges = as_column(bins.left).astype(input_arr.dtype)
+        right_edges = as_column(bins.right).astype(input_arr.dtype)
+    else:
+        # get the left and right edges of the bins as columns
+        left_edges = as_column(bins[:-1:], dtype="float64")
+        right_edges = as_column(bins[+1::], dtype="float64")
+        # the input arr must be changed to the same type as the edges
+        input_arr = input_arr.astype(left_edges.dtype)
+    # get the indexes for the appropriate number
+    index_labels = label_bins(
+        input_arr, left_edges, left_inclusive, right_edges, right_inclusive
+    )
+
+    if labels is False:
+        # if labels is false we return the index labels, we return them
+        # as a series if we have a series input
+        if isinstance(orig_x, (pd.Series, cudf.Series)):
+            # need to run more tests but looks like in this case pandas
+            # always returns a float64 dtype
+            indx_arr_series = cudf.Series(index_labels, dtype="float64")
+            # if retbins we return the bins as well
+            if retbins:
+                return indx_arr_series, bins
+            else:
+                return indx_arr_series
+        elif retbins:
+            return index_labels.values, bins
+        else:
+            return index_labels.values
+
+    if labels is not None:
+        if labels is not ordered and len(set(labels)) != len(labels):
+            # when we have duplicate labels and ordered is False, we
+            # should allow duplicate categories. The categories are
+            # returned in order
+            new_data = [interval_labels[i][0] for i in index_labels.values]
+            return cudf.CategoricalIndex(
+                new_data, categories=sorted(set(labels)), ordered=False
+            )
+
+    col = build_categorical_column(
+        categories=interval_labels,
+        codes=index_labels,
+        mask=index_labels.base_mask,
+        offset=index_labels.offset,
+        size=index_labels.size,
+        ordered=ordered,
+    )
+
+    # we return a categorical index, as we don't have a Categorical method
+    categorical_index = cudf.core.index.as_index(col)
+
+    if isinstance(orig_x, (pd.Series, cudf.Series)):
+        # if we have a series input we return a series output
+        res_series = cudf.Series(categorical_index, index=orig_x.index)
+        if retbins:
+            return res_series, bins
+        else:
+            return res_series
+    elif retbins:
+        # if retbins is true we return the bins as well
+        return categorical_index, bins
+    else:
+        return categorical_index
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
@@ -9,6 +9,7 @@
 import pyarrow as pa
 from pandas.api.extensions import ExtensionDtype
 from pandas.core.arrays._arrow_utils import ArrowIntervalType
+from cudf.utils.dtypes import is_interval_dtype
 
 import cudf
 from cudf._typing import Dtype
@@ -72,7 +73,7 @@ def to_pandas(self) -> pd.CategoricalDtype:
     def _init_categories(self, categories: Any):
         if categories is None:
             return categories
-        if len(categories) == 0:
+        if len(categories) == 0 and not is_interval_dtype(categories):
             dtype = "object"  # type: Any
         else:
             dtype = None

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -480,7 +480,6 @@ def to_frame(self, index=True, name=None):
             col_name = 0
         else:
             col_name = self.name
-
         return cudf.DataFrame(
             {col_name: self._values}, index=self if index else None
         )
@@ -1080,7 +1079,6 @@ def to_series(self, index=None, name=None):
         Series
             The dtype will be based on the type of the Index values.
         """
-
         return cudf.Series(
             self._values,
             index=self.copy(deep=False) if index is None else index,
@@ -2796,6 +2794,8 @@ def as_index(arbitrary, **kwargs) -> BaseIndex:
         return TimedeltaIndex(arbitrary, **kwargs)
     elif isinstance(arbitrary, CategoricalColumn):
         return CategoricalIndex(arbitrary, **kwargs)
+    elif isinstance(arbitrary, IntervalColumn):
+        return IntervalIndex(arbitrary, **kwargs)
     elif isinstance(arbitrary, cudf.Series):
         return as_index(arbitrary._column, **kwargs)
     elif isinstance(arbitrary, pd.RangeIndex):