diff --git a/CHANGELOG.md b/CHANGELOG.md
index e22277c4a29..caab91ec94d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 - PR #6814 Implement `cudf::reduce` for `decimal32` and `decimal64` (part 1)
 - PR #6929 Add `Index.set_names` api
 - PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support
+- PR #6885 Share `factorize` implementation with Index and cudf module
 
 ## Improvements
 
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 91242b9ca06..9008c6a73a2 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -39,6 +39,7 @@
     from_pandas,
     merge,
 )
+from cudf.core.algorithms import factorize
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.groupby import Grouper
 from cudf.core.ops import (
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
new file mode 100644
index 00000000000..6cfe158651b
--- /dev/null
+++ b/python/cudf/cudf/core/algorithms.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+from warnings import warn
+
+import cupy as cp
+
+from cudf.core.series import Index, Series
+
+
+def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
+    """Encode the input values as integer labels
+
+    Parameters
+    ----------
+    values : Series, Index, or CuPy array
+        The data to be factorized.
+    sort : bool, default False
+        Sorting is not yet supported; a truthy value raises
+        ``NotImplementedError``.
+    na_sentinel : number, default -1
+        Value to indicate missing category. ``None`` is not supported.
+    size_hint : optional
+        Not applicable for cudf; a warning is emitted when a truthy
+        value is passed.
+
+    Returns
+    -------
+    (labels, cats) : (cupy.ndarray, cupy.ndarray or Index)
+        - *labels* contains the encoded values
+        - *cats* contains the categories in order that the N-th
+            item corresponds to the (N-1) code. It is a CuPy array
+            when `values` is a CuPy array and an Index otherwise.
+
+    Raises
+    ------
+    NotImplementedError
+        If `sort` is truthy or `na_sentinel` is None.
+
+    Examples
+    --------
+    >>> import cudf
+    >>> data = cudf.Series(['a', 'c', 'c'])
+    >>> codes, uniques = cudf.factorize(data)
+    >>> codes
+    array([0, 1, 1], dtype=int8)
+    >>> uniques
+    StringIndex(['a' 'c'], dtype='object')
+
+    See Also
+    --------
+    cudf.Series.factorize
+
+    """
+    if sort:
+        raise NotImplementedError(
+            "Sorting not yet supported during factorization."
+        )
+    if na_sentinel is None:
+        raise NotImplementedError("na_sentinel can not be None.")
+
+    if size_hint:
+        warn("size_hint is not applicable for cudf.factorize")
+
+    # Record the input kind now; `values` is rebound to a Series below.
+    return_cupy_array = isinstance(values, cp.ndarray)
+
+    values = Series(values)
+
+    cats = values._column.dropna().unique().astype(values.dtype)
+
+    name = values.name  # label_encoding mutates self.name
+    labels = values.label_encoding(cats=cats, na_sentinel=na_sentinel).values
+    values.name = name
+
+    return labels, cats.values if return_cupy_array else Index(cats)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index f60288b95b3..5643acd5379 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -452,6 +452,29 @@ def _clean_nulls_from_index(self):
         else:
             return self
 
+    def factorize(self, na_sentinel=-1):
+        """
+        Encode the input values as integer labels
+
+        Parameters
+        ----------
+        na_sentinel : number, default -1
+            Value to indicate missing category.
+
+        Returns
+        -------
+        (labels, cats) : (cupy.ndarray, Index)
+            - *labels* contains the encoded values
+            - *cats* contains the unique non-null values
+
+        See Also
+        --------
+        cudf.factorize
+        cudf.Series.factorize
+
+        """
+        return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel)
+
     @property
     def nlevels(self):
         """
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 3e8f8212b0c..7caeb196e65 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -618,9 +618,9 @@ def _compute_levels_and_codes(self):
         codes = cudf.DataFrame()
         for name in self._source_data.columns:
             code, cats = self._source_data[name].factorize()
-            codes[name] = code.reset_index(drop=True).astype(np.int64)
+            codes[name] = code.astype(np.int64)
             cats.name = None
-            cats = cats.reset_index(drop=True)._copy_construct(name=None)
+            cats = cudf.Series(cats)._copy_construct(name=None)
             levels.append(cats)
 
         self._levels = levels
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 0ea76877bab..1d119b58878 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2574,13 +2574,7 @@ def factorize(self, na_sentinel=-1):
         1    c
         dtype: object
         """
-        cats = self.dropna().unique().astype(self.dtype)
-
-        name = self.name  # label_encoding mutates self.name
-        labels = self.label_encoding(cats=cats, na_sentinel=na_sentinel)
-        self.name = name
-
-        return labels, cats
+        return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel)
 
     # UDF related
 
diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py
index 4c42ee2a7bf..61d11fa5961 100644
--- a/python/cudf/cudf/tests/test_factorize.py
+++ b/python/cudf/cudf/tests/test_factorize.py
@@ -1,14 +1,17 @@
 # Copyright (c) 2018, NVIDIA CORPORATION.
 
+import cupy as cp
 import numpy as np
+import pandas as pd
 import pytest
 
-from cudf.core import DataFrame, Series
+import cudf
+from cudf.core import DataFrame, Index
 from cudf.tests.utils import assert_eq
 
 
 @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
-def test_factorize(ncats, nelem):
+def test_factorize_series_obj(ncats, nelem):
     df = DataFrame()
     np.random.seed(0)
 
@@ -17,15 +20,34 @@ def test_factorize(ncats, nelem):
 
     uvals, labels = df["cats"].factorize()
     np.testing.assert_array_equal(labels.to_array(), sorted(set(arr)))
-    assert isinstance(uvals, Series)
-    assert isinstance(labels, Series)
+    assert isinstance(uvals, cp.ndarray)
+    assert isinstance(labels, Index)
 
     encoder = dict((labels[idx], idx) for idx in range(len(labels)))
     handcoded = [encoder[v] for v in arr]
-    np.testing.assert_array_equal(uvals.to_array(), handcoded)
+    np.testing.assert_array_equal(uvals.get(), handcoded)
 
 
-def test_factorize_index():
+@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
+def test_factorize_index_obj(ncats, nelem):
+    df = DataFrame()
+    np.random.seed(0)
+
+    # initialize data frame
+    df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32)
+    df = df.set_index("cats")
+
+    uvals, labels = df.index.factorize()
+    np.testing.assert_array_equal(labels.values.get(), sorted(set(arr)))
+    assert isinstance(uvals, cp.ndarray)
+    assert isinstance(labels, Index)
+
+    encoder = dict((labels[idx], idx) for idx in range(len(labels)))
+    handcoded = [encoder[v] for v in arr]
+    np.testing.assert_array_equal(uvals.get(), handcoded)
+
+
+def test_factorize_series_index():
     df = DataFrame()
     df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
     df["col2"] = [
@@ -40,10 +62,7 @@ def test_factorize_index():
         2992446.0,
         2992448.0,
     ]
-
-    assert_eq(
-        df.col1.factorize()[0].to_array(), df.to_pandas().col1.factorize()[0]
-    )
+    assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0])
     assert_eq(
         df.col1.factorize()[1].to_pandas().values,
         df.to_pandas().col1.factorize()[1].values,
@@ -51,10 +70,72 @@ def test_factorize_index():
 
     df = df.set_index("col2")
 
-    assert_eq(
-        df.col1.factorize()[0].to_array(), df.to_pandas().col1.factorize()[0]
-    )
+    assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0])
     assert_eq(
         df.col1.factorize()[1].to_pandas().values,
         df.to_pandas().col1.factorize()[1].values,
     )
+
+
+def test_cudf_factorize_series():
+    data = [1, 2, 3, 4, 5]
+
+    psr = pd.Series(data)
+    gsr = cudf.Series(data)
+
+    expect = pd.factorize(psr)
+    got = cudf.factorize(gsr)
+
+    assert len(expect) == len(got)
+
+    np.testing.assert_array_equal(expect[0], got[0].get())
+    np.testing.assert_array_equal(expect[1], got[1].values.get())
+
+
+def test_cudf_factorize_index():
+    data = [1, 2, 3, 4, 5]
+
+    pi = pd.Index(data)
+    gi = cudf.Index(data)
+
+    expect = pd.factorize(pi)
+    got = cudf.factorize(gi)
+
+    assert len(expect) == len(got)
+
+    np.testing.assert_array_equal(expect[0], got[0].get())
+    np.testing.assert_array_equal(expect[1], got[1].values.get())
+
+
+def test_cudf_factorize_array():
+    data = [1, 2, 3, 4, 5]
+
+    parr = np.array(data)
+    garr = cp.array(data)
+
+    expect = pd.factorize(parr)
+    got = cudf.factorize(garr)
+
+    assert len(expect) == len(got)
+
+    np.testing.assert_array_equal(expect[0], got[0].get())
+    np.testing.assert_array_equal(expect[1], got[1].get())
+
+
+def test_factorize_result_classes():
+    data = [1, 2, 3]
+
+    labels, cats = cudf.factorize(cudf.Series(data))
+
+    assert isinstance(labels, cp.ndarray)
+    assert isinstance(cats, cudf.Index)
+
+    labels, cats = cudf.factorize(cudf.Index(data))
+
+    assert isinstance(labels, cp.ndarray)
+    assert isinstance(cats, cudf.Index)
+
+    labels, cats = cudf.factorize(cp.array(data))
+
+    assert isinstance(labels, cp.ndarray)
+    assert isinstance(cats, cp.ndarray)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 300b17b1625..4c6589789bf 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -474,8 +474,8 @@ def test_series_factorize(data, na_sentinel):
     expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel)
     actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)
 
-    assert_eq(expected_labels, actual_labels.to_array())
-    assert_eq(expected_cats.values, actual_cats.to_array())
+    assert_eq(expected_labels, actual_labels.get())
+    assert_eq(expected_cats.values, actual_cats.to_pandas().values)
 
 
 @pytest.mark.parametrize(