From 154c03d4f1354275b2f474116311d1f7668147f6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Dec 2020 12:05:42 -0800 Subject: [PATCH 01/16] add Index.factorize and cudf.factorize and tests --- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/core/index.py | 5 ++++ python/cudf/cudf/core/series.py | 29 ++++++++++++++++-------- python/cudf/cudf/tests/test_factorize.py | 24 +++++++++++++++++--- 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 91242b9ca06..322f8928246 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -62,7 +62,7 @@ true_divide, ) from cudf.core.reshape import concat, get_dummies, melt, merge_sorted -from cudf.core.series import isclose +from cudf.core.series import _factorize as factorize, isclose from cudf.core.tools.datetimes import to_datetime from cudf.core.tools.numeric import to_numeric from cudf.io import ( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7485b99b0ce..216f54aaa24 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -452,6 +452,11 @@ def _clean_nulls_from_index(self): else: return self + def factorize(self, na_sentinel=-1): + from cudf.core.series import _factorize + + return _factorize(self, na_sentinel=na_sentinel) + def fillna(self, value, downcast=None): """ Fill null values with the specified value. diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4dac81f6189..f9de7ed9057 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -21,10 +21,10 @@ ColumnBase, DatetimeColumn, TimeDeltaColumn, + arange, as_column, column, column_empty_like, - arange, full, ) from cudf.core.column.categorical import ( @@ -50,8 +50,10 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.utils import get_relevant_submodule -from cudf.utils.utils import get_appropriate_dispatched_func +from cudf.utils.utils import ( + get_appropriate_dispatched_func, + get_relevant_submodule, +) class Series(Frame, Serializable): @@ -2572,13 +2574,7 @@ def factorize(self, na_sentinel=-1): 1 c dtype: object """ - cats = self.dropna().unique().astype(self.dtype) - - name = self.name # label_encoding mutates self.name - labels = self.label_encoding(cats=cats, na_sentinel=na_sentinel) - self.name = name - - return labels, cats + return _factorize(self, na_sentinel=na_sentinel) # UDF related @@ -5216,3 +5212,16 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): result_col[equal_nulls] = True return Series(result_col, index=index) + + +def _factorize(obj, na_sentinel=-1): + + obj = cudf.Series(obj) + + cats = obj.dropna().unique().astype(obj.dtype) + + name = obj.name # label_encoding mutates self.name + labels = obj.label_encoding(cats=cats, na_sentinel=na_sentinel) + obj.name = name + + return labels, cats diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 4c42ee2a7bf..193b138e871 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -8,7 +8,7 @@ @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) -def test_factorize(ncats, nelem): +def test_factorize_series_obj(ncats, nelem): df = DataFrame() np.random.seed(0) @@ -25,7 +25,26 @@ def test_factorize(ncats, nelem): np.testing.assert_array_equal(uvals.to_array(), handcoded) -def test_factorize_index(): +@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) +def test_factorize_index_obj(ncats, nelem): + df = DataFrame() + np.random.seed(0) + + # initialize data frame + df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) + df = df.set_index("cats") + + uvals, labels = df.index.factorize() + np.testing.assert_array_equal(labels.to_array(), sorted(set(arr))) + assert isinstance(uvals, Series) + assert isinstance(labels, Series) + + encoder = dict((labels[idx], idx) for idx in range(len(labels))) + handcoded = [encoder[v] for v in arr] + np.testing.assert_array_equal(uvals.to_array(), handcoded) + + +def test_factorize_series_index(): df = DataFrame() df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"] df["col2"] = [ @@ -40,7 +59,6 @@ def test_factorize_index(): 2992446.0, 2992448.0, ] - assert_eq( df.col1.factorize()[0].to_array(), df.to_pandas().col1.factorize()[0] ) From 4711a31a167be52d6e55a0693063c985acf8a12d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Dec 2020 13:42:45 -0800 Subject: [PATCH 02/16] index.py cleanup --- python/cudf/cudf/core/index.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 216f54aaa24..8e6c2e3d527 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -453,9 +453,15 @@ def _clean_nulls_from_index(self): return self def factorize(self, na_sentinel=-1): - from cudf.core.series import _factorize + """ + Encode the input values as integer labels - return _factorize(self, na_sentinel=na_sentinel) + See Also + -------- + cudf.Series.factorize + + """ + return cudf.core.series._factorize(self, na_sentinel=na_sentinel) def fillna(self, value, downcast=None): """ From aac5aae6cebac3f3259bdc2af0bfbd542cc14802 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Dec 2020 16:22:49 -0800 Subject: [PATCH 03/16] restructure things --- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/series.py | 21 +++++++++------------ 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 322f8928246..86a27404195 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -62,7 +62,7 @@ true_divide, ) from cudf.core.reshape import concat, get_dummies, melt, merge_sorted -from cudf.core.series import _factorize as factorize, isclose +from cudf.core.series import factorize, isclose from cudf.core.tools.datetimes import to_datetime from cudf.core.tools.numeric import to_numeric from cudf.io import ( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8e6c2e3d527..d9e9d4c43f5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -461,7 +461,7 @@ def factorize(self, na_sentinel=-1): cudf.Series.factorize """ - return cudf.core.series._factorize(self, na_sentinel=na_sentinel) + return cudf.core.series.factorize(self, na_sentinel=na_sentinel) def fillna(self, value, downcast=None): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f9de7ed9057..7cd7fac135f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2574,7 +2574,13 @@ def factorize(self, na_sentinel=-1): 1 c dtype: object """ - return _factorize(self, na_sentinel=na_sentinel) + cats = self.dropna().unique().astype(self.dtype) + + name = self.name # label_encoding mutates self.name + labels = self.label_encoding(cats=cats, na_sentinel=na_sentinel) + self.name = name + + return labels, cats # UDF related @@ -5214,14 +5220,5 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): return Series(result_col, index=index) -def _factorize(obj, na_sentinel=-1): - - obj = cudf.Series(obj) - - cats = obj.dropna().unique().astype(obj.dtype) - - name = obj.name # label_encoding mutates self.name - labels = obj.label_encoding(cats=cats, na_sentinel=na_sentinel) - obj.name = name - - return labels, cats +def factorize(obj, na_sentinel=-1): + return cudf.Series(obj).factorize(na_sentinel=na_sentinel) From 66c6025e26509766cbb0a495c9520263c52ce4ae Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 3 Dec 2020 16:26:15 -0800 Subject: [PATCH 04/16] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9baaaa22235..5c140435865 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ - PR #6765 Cupy fallback for __array_function__ and __array_ufunc__ for cudf.Series - PR #6817 Add support for scatter() on lists-of-struct columns - PR #6805 Implement `cudf::detail::copy_if` for `decimal32` and `decimal64` +- PR #6885 Share `factorize` implementation with Index and cudf module ## Improvements From bc73626df0c01a8bc1b8e1fc07f3f1a312652a22 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 7 Dec 2020 13:09:57 -0800 Subject: [PATCH 05/16] add top level function tests --- python/cudf/cudf/tests/test_factorize.py | 60 +++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 193b138e871..64b4dd03987 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -3,9 +3,12 @@ import numpy as np import pytest +import cudf from cudf.core import DataFrame, Series from cudf.tests.utils import assert_eq +import pandas as pd +import cupy as cp @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) def test_factorize_series_obj(ncats, nelem): @@ -43,7 +46,6 @@ def test_factorize_index_obj(ncats, nelem): handcoded = [encoder[v] for v in arr] np.testing.assert_array_equal(uvals.to_array(), handcoded) - def test_factorize_series_index(): df = DataFrame() df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"] @@ -76,3 +78,59 @@ def test_factorize_series_index(): df.col1.factorize()[1].to_pandas().values, df.to_pandas().col1.factorize()[1].values, ) + +def test_cudf_factorize_series(): + data = [1,2,3,4,5] + + psr = pd.Series(data) + gsr = cudf.Series(data) + + expect = pd.factorize(psr) + got = cudf.factorize(gsr) + + assert(len(expect) == len(got)) + + np.testing.assert_array_equal(expect[0], got[0].to_array()) + np.testing.assert_array_equal(expect[1], got[1].to_array()) + +def test_cudf_factorize_series(): + data = [1,2,3,4,5] + + psr = pd.Series(data) + gsr = cudf.Series(data) + + expect = pd.factorize(psr) + got = cudf.factorize(gsr) + + assert(len(expect) == len(got)) + + np.testing.assert_array_equal(expect[0], got[0].to_array()) + np.testing.assert_array_equal(expect[1], got[1].to_array()) + +def test_cudf_factorize_index(): + data = [1,2,3,4,5] + + pi = pd.Index(data) + gi = cudf.Index(data) + + expect = pd.factorize(pi) + got = cudf.factorize(gi) + + assert(len(expect) == len(got)) + + np.testing.assert_array_equal(expect[0], got[0].to_array()) + np.testing.assert_array_equal(expect[1], got[1].to_array()) + +def test_cudf_factorize_array(): + data = [1,2,3,4,5] + + parr = np.array(data) + garr = cp.array(data) + + expect = pd.factorize(parr) + got = cudf.factorize(garr) + + assert(len(expect) == len(got)) + + np.testing.assert_array_equal(expect[0], got[0].to_array()) + np.testing.assert_array_equal(expect[1], got[1].to_array()) From 2ed46f235905e6d2b7f8283951050466ef72f659 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 7 Dec 2020 13:37:26 -0800 Subject: [PATCH 06/16] cleanup and style --- python/cudf/cudf/core/series.py | 44 ++++++++++++++++++++++-- python/cudf/cudf/tests/test_factorize.py | 32 ++++++----------- 2 files changed, 53 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7cd7fac135f..39fa137e91e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5220,5 +5220,45 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): return Series(result_col, index=index) -def factorize(obj, na_sentinel=-1): - return cudf.Series(obj).factorize(na_sentinel=na_sentinel) +def factorize(values, sort=False, na_sentinel=-1): + """Encode the input values as integer labels + + Parameters + ---------- + values: Series, Index, or CuPy array + The data to be factorized. + na_sentinel : number + Value to indicate missing category. + + Returns + -------- + (labels, cats) : (Series, Series) + - *labels* contains the encoded values + - *cats* contains the categories in order that the N-th + item corresponds to the (N-1) code. + + Examples + -------- + >>> import cudf + >>> data = cudf.Series(['a', 'c', 'c']) + >>> codes, uniques = cudf.factorize(data) + >>> codes + 0 0 + 1 1 + 2 1 + dtype: int8 + >>> uniques + 0 a + 1 c + dtype: object + + See Also + -------- + cudf.Series.factorize + + """ + if sort: + raise NotImplementedError( + "Sorting not yet supported during factorization." + ) + return cudf.Series(values).factorize(na_sentinel=na_sentinel) diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 64b4dd03987..7d36e3faf29 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -1,14 +1,14 @@ # Copyright (c) 2018, NVIDIA CORPORATION. +import cupy as cp import numpy as np +import pandas as pd import pytest import cudf from cudf.core import DataFrame, Series from cudf.tests.utils import assert_eq -import pandas as pd -import cupy as cp @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) def test_factorize_series_obj(ncats, nelem): @@ -46,6 +46,7 @@ def test_factorize_index_obj(ncats, nelem): handcoded = [encoder[v] for v in arr] np.testing.assert_array_equal(uvals.to_array(), handcoded) + def test_factorize_series_index(): df = DataFrame() df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"] @@ -79,22 +80,9 @@ def test_factorize_series_index(): df.to_pandas().col1.factorize()[1].values, ) -def test_cudf_factorize_series(): - data = [1,2,3,4,5] - - psr = pd.Series(data) - gsr = cudf.Series(data) - - expect = pd.factorize(psr) - got = cudf.factorize(gsr) - - assert(len(expect) == len(got)) - - np.testing.assert_array_equal(expect[0], got[0].to_array()) - np.testing.assert_array_equal(expect[1], got[1].to_array()) def test_cudf_factorize_series(): - data = [1,2,3,4,5] + data = [1, 2, 3, 4, 5] psr = pd.Series(data) gsr = cudf.Series(data) @@ -102,13 +90,14 @@ def test_cudf_factorize_series(): expect = pd.factorize(psr) got = cudf.factorize(gsr) - assert(len(expect) == len(got)) + assert len(expect) == len(got) np.testing.assert_array_equal(expect[0], got[0].to_array()) np.testing.assert_array_equal(expect[1], got[1].to_array()) + def test_cudf_factorize_index(): - data = [1,2,3,4,5] + data = [1, 2, 3, 4, 5] pi = pd.Index(data) gi = cudf.Index(data) @@ -116,13 +105,14 @@ def test_cudf_factorize_index(): expect = pd.factorize(pi) got = cudf.factorize(gi) - assert(len(expect) == len(got)) + assert len(expect) == len(got) np.testing.assert_array_equal(expect[0], got[0].to_array()) np.testing.assert_array_equal(expect[1], got[1].to_array()) + def test_cudf_factorize_array(): - data = [1,2,3,4,5] + data = [1, 2, 3, 4, 5] parr = np.array(data) garr = cp.array(data) @@ -130,7 +120,7 @@ def test_cudf_factorize_array(): expect = pd.factorize(parr) got = cudf.factorize(garr) - assert(len(expect) == len(got)) + assert len(expect) == len(got) np.testing.assert_array_equal(expect[0], got[0].to_array()) np.testing.assert_array_equal(expect[1], got[1].to_array()) From 49c02cad7dabc470b7707aa3b388973259fd9f4e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Dec 2020 11:58:32 -0800 Subject: [PATCH 07/16] error for size_hint parameter --- python/cudf/cudf/core/series.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 39fa137e91e..c8e7d78b484 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5220,7 +5220,7 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): return Series(result_col, index=index) -def factorize(values, sort=False, na_sentinel=-1): +def factorize(values, sort=False, na_sentinel=-1, size_hint=None): """Encode the input values as integer labels Parameters @@ -5261,4 +5261,9 @@ def factorize(values, sort=False, na_sentinel=-1): raise NotImplementedError( "Sorting not yet supported during factorization." ) + if size_hint: + raise NotImplementedError( + "size_hint is not applicable for cudf.factorize" + ) + return cudf.Series(values).factorize(na_sentinel=na_sentinel) From 755e584ec67812a07e8457ad021b5cce66e5c58c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Dec 2020 12:37:56 -0800 Subject: [PATCH 08/16] reorganize --- python/cudf/cudf/core/algorithms.py | 59 +++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 python/cudf/cudf/core/algorithms.py diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py new file mode 100644 index 00000000000..8875d416e4e --- /dev/null +++ b/python/cudf/cudf/core/algorithms.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +from cudf.core.series import Series + + +def factorize(values, sort=False, na_sentinel=-1, size_hint=None): + """Encode the input values as integer labels + + Parameters + ---------- + values: Series, Index, or CuPy array + The data to be factorized. + na_sentinel : number + Value to indicate missing category. + + Returns + -------- + (labels, cats) : (Series, Series) + - *labels* contains the encoded values + - *cats* contains the categories in order that the N-th + item corresponds to the (N-1) code. + + Examples + -------- + >>> import cudf + >>> data = cudf.Series(['a', 'c', 'c']) + >>> codes, uniques = cudf.factorize(data) + >>> codes + 0 0 + 1 1 + 2 1 + dtype: int8 + >>> uniques + 0 a + 1 c + dtype: object + + See Also + -------- + cudf.Series.factorize + + """ + if sort: + raise NotImplementedError( + "Sorting not yet supported during factorization." + ) + if size_hint: + raise NotImplementedError( + "size_hint is not applicable for cudf.factorize" + ) + + values = Series(values) + + cats = values.dropna().unique().astype(values.dtype) + + name = values.name # label_encoding mutates self.name + labels = values.label_encoding(cats=cats, na_sentinel=na_sentinel) + values.name = name + + return labels, cats From c7d1a35c03dec0d7eb091be01f0d6a9050ec3367 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Dec 2020 12:40:16 -0800 Subject: [PATCH 09/16] reorganize and style --- python/cudf/cudf/__init__.py | 3 +- python/cudf/cudf/core/__init__.py | 2 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/series.py | 57 +------------------------------ 4 files changed, 5 insertions(+), 59 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 86a27404195..9008c6a73a2 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -39,6 +39,7 @@ from_pandas, merge, ) +from cudf.core.algorithms import factorize from cudf.core.dtypes import CategoricalDtype from cudf.core.groupby import Grouper from cudf.core.ops import ( @@ -62,7 +63,7 @@ true_divide, ) from cudf.core.reshape import concat, get_dummies, melt, merge_sorted -from cudf.core.series import factorize, isclose +from cudf.core.series import isclose from cudf.core.tools.datetimes import to_datetime from cudf.core.tools.numeric import to_numeric from cudf.io import ( diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index d6c232373c7..05d106881b0 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. -from cudf.core import buffer, column, common +from cudf.core import algorithms, buffer, column, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 935fc1b84b3..dd86fbc1390 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -461,7 +461,7 @@ def factorize(self, na_sentinel=-1): cudf.Series.factorize """ - return cudf.core.series.factorize(self, na_sentinel=na_sentinel) + return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) def fillna(self, value, downcast=None): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c8e7d78b484..b3cf001656a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2574,13 +2574,7 @@ def factorize(self, na_sentinel=-1): 1 c dtype: object """ - cats = self.dropna().unique().astype(self.dtype) - - name = self.name # label_encoding mutates self.name - labels = self.label_encoding(cats=cats, na_sentinel=na_sentinel) - self.name = name - - return labels, cats + return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) # UDF related @@ -5218,52 +5212,3 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): result_col[equal_nulls] = True return Series(result_col, index=index) - - -def factorize(values, sort=False, na_sentinel=-1, size_hint=None): - """Encode the input values as integer labels - - Parameters - ---------- - values: Series, Index, or CuPy array - The data to be factorized. - na_sentinel : number - Value to indicate missing category. - - Returns - -------- - (labels, cats) : (Series, Series) - - *labels* contains the encoded values - - *cats* contains the categories in order that the N-th - item corresponds to the (N-1) code. - - Examples - -------- - >>> import cudf - >>> data = cudf.Series(['a', 'c', 'c']) - >>> codes, uniques = cudf.factorize(data) - >>> codes - 0 0 - 1 1 - 2 1 - dtype: int8 - >>> uniques - 0 a - 1 c - dtype: object - - See Also - -------- - cudf.Series.factorize - - """ - if sort: - raise NotImplementedError( - "Sorting not yet supported during factorization." - ) - if size_hint: - raise NotImplementedError( - "size_hint is not applicable for cudf.factorize" - ) - - return cudf.Series(values).factorize(na_sentinel=na_sentinel) From 729c5c7942669c8a35c26a8dc6f37bb28f9f95f6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Dec 2020 12:55:08 -0800 Subject: [PATCH 10/16] untangle imports --- python/cudf/cudf/core/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 05d106881b0..d6c232373c7 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. -from cudf.core import algorithms, buffer, column, common +from cudf.core import buffer, column, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( From 366022df7c6eff5084043202d5902166242e8bcb Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Dec 2020 13:11:08 -0800 Subject: [PATCH 11/16] raise for na_sentinel==None --- python/cudf/cudf/core/algorithms.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 8875d416e4e..6ce7af73557 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -47,6 +47,8 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): raise NotImplementedError( "size_hint is not applicable for cudf.factorize" ) + if not na_sentinel: + raise NotImplementedError("na_sentinel can not be None.") values = Series(values) From c281cec94cf2ec8abe1dcc042ebd271e116e4917 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 8 Dec 2020 13:12:00 -0800 Subject: [PATCH 12/16] adjust docstring --- python/cudf/cudf/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 6ce7af73557..8111c08fc98 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -9,7 +9,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): ---------- values: Series, Index, or CuPy array The data to be factorized. - na_sentinel : number + na_sentinel : number, default -1 Value to indicate missing category. Returns From bbb3f6c51ad21387ac02ffd0f1d65ca75ab130e7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 11 Dec 2020 06:59:50 -0800 Subject: [PATCH 13/16] use a warning instead of erroring --- python/cudf/cudf/core/algorithms.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 8111c08fc98..4f3c010ff3e 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,4 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from warnings import warn + from cudf.core.series import Series @@ -43,13 +45,12 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): raise NotImplementedError( "Sorting not yet supported during factorization." ) - if size_hint: - raise NotImplementedError( - "size_hint is not applicable for cudf.factorize" - ) if not na_sentinel: raise NotImplementedError("na_sentinel can not be None.") + if size_hint: + warn("size_hint is not applicable for cudf.factorize") + values = Series(values) cats = values.dropna().unique().astype(values.dtype) From 5560fc4aa503637fe9f921fca3251a356aa67a1f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 11 Dec 2020 07:30:46 -0800 Subject: [PATCH 14/16] adjust implementation and tests --- python/cudf/cudf/core/algorithms.py | 10 +++-- python/cudf/cudf/tests/test_factorize.py | 55 +++++++++++++++--------- 2 files changed, 42 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 4f3c010ff3e..5cd5e46f808 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,7 +1,9 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from warnings import warn -from cudf.core.series import Series +import cupy as cp + +from cudf.core.series import Index, Series def factorize(values, sort=False, na_sentinel=-1, size_hint=None): @@ -51,12 +53,14 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): if size_hint: warn("size_hint is not applicable for cudf.factorize") + return_cupy_array = isinstance(values, cp.core.core.ndarray) + values = Series(values) cats = values.dropna().unique().astype(values.dtype) name = values.name # label_encoding mutates self.name - labels = values.label_encoding(cats=cats, na_sentinel=na_sentinel) + labels = values.label_encoding(cats=cats, na_sentinel=na_sentinel).values values.name = name - return labels, cats + return labels, cats.values if return_cupy_array else Index(cats) diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 7d36e3faf29..61d11fa5961 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.core import DataFrame, Series +from cudf.core import DataFrame, Index from cudf.tests.utils import assert_eq @@ -20,12 +20,12 @@ def test_factorize_series_obj(ncats, nelem): uvals, labels = df["cats"].factorize() np.testing.assert_array_equal(labels.to_array(), sorted(set(arr))) - assert isinstance(uvals, Series) - assert isinstance(labels, Series) + assert isinstance(uvals, cp.core.core.ndarray) + assert isinstance(labels, Index) encoder = dict((labels[idx], idx) for idx in range(len(labels))) handcoded = [encoder[v] for v in arr] - np.testing.assert_array_equal(uvals.to_array(), handcoded) + np.testing.assert_array_equal(uvals.get(), handcoded) @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) @@ -38,13 +38,13 @@ def test_factorize_index_obj(ncats, nelem): df = df.set_index("cats") uvals, labels = df.index.factorize() - np.testing.assert_array_equal(labels.to_array(), sorted(set(arr))) - assert isinstance(uvals, Series) - assert isinstance(labels, Series) + np.testing.assert_array_equal(labels.values.get(), sorted(set(arr))) + assert isinstance(uvals, cp.core.core.ndarray) + assert isinstance(labels, Index) encoder = dict((labels[idx], idx) for idx in range(len(labels))) handcoded = [encoder[v] for v in arr] - np.testing.assert_array_equal(uvals.to_array(), handcoded) + np.testing.assert_array_equal(uvals.get(), handcoded) def test_factorize_series_index(): @@ -62,9 +62,7 @@ def test_factorize_series_index(): 2992446.0, 2992448.0, ] - assert_eq( - df.col1.factorize()[0].to_array(), df.to_pandas().col1.factorize()[0] - ) + assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0]) assert_eq( df.col1.factorize()[1].to_pandas().values, df.to_pandas().col1.factorize()[1].values, @@ -72,9 +70,7 @@ def test_factorize_series_index(): df = df.set_index("col2") - assert_eq( - df.col1.factorize()[0].to_array(), df.to_pandas().col1.factorize()[0] - ) + assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0]) assert_eq( df.col1.factorize()[1].to_pandas().values, df.to_pandas().col1.factorize()[1].values, @@ -92,8 +88,8 @@ def test_cudf_factorize_series(): assert len(expect) == len(got) - np.testing.assert_array_equal(expect[0], got[0].to_array()) - np.testing.assert_array_equal(expect[1], got[1].to_array()) + np.testing.assert_array_equal(expect[0], got[0].get()) + np.testing.assert_array_equal(expect[1], got[1].values.get()) def test_cudf_factorize_index(): @@ -107,8 +103,8 @@ def test_cudf_factorize_index(): assert len(expect) == len(got) - np.testing.assert_array_equal(expect[0], got[0].to_array()) - np.testing.assert_array_equal(expect[1], got[1].to_array()) + np.testing.assert_array_equal(expect[0], got[0].get()) + np.testing.assert_array_equal(expect[1], got[1].values.get()) def test_cudf_factorize_array(): @@ -122,5 +118,24 @@ def test_cudf_factorize_array(): assert len(expect) == len(got) - np.testing.assert_array_equal(expect[0], got[0].to_array()) - np.testing.assert_array_equal(expect[1], got[1].to_array()) + np.testing.assert_array_equal(expect[0], got[0].get()) + np.testing.assert_array_equal(expect[1], got[1].get()) + + +def test_factorize_result_classes(): + data = [1, 2, 3] + + labels, cats = cudf.factorize(cudf.Series(data)) + + assert isinstance(labels, cp.core.core.ndarray) + assert isinstance(cats, cudf.Index) + + labels, cats = cudf.factorize(cudf.Index(data)) + + assert isinstance(labels, cp.core.core.ndarray) + assert isinstance(cats, cudf.Index) + + labels, cats = cudf.factorize(cp.array(data)) + + assert isinstance(labels, cp.core.core.ndarray) + assert isinstance(cats, cp.core.core.ndarray) From 50fdf035e3a1428285e7a52c42c46bc1228988f7 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 11 Dec 2020 09:38:44 -0800 Subject: [PATCH 15/16] fix tests --- python/cudf/cudf/core/algorithms.py | 2 +- python/cudf/cudf/core/multiindex.py | 4 ++-- python/cudf/cudf/tests/test_series.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 5cd5e46f808..e2fd939e8c1 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -47,7 +47,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): raise NotImplementedError( "Sorting not yet supported during factorization." ) - if not na_sentinel: + if na_sentinel is None: raise NotImplementedError("na_sentinel can not be None.") if size_hint: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 717aed4e163..c9baa991acf 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -574,9 +574,9 @@ def _compute_levels_and_codes(self): codes = cudf.DataFrame() for name in self._source_data.columns: code, cats = self._source_data[name].factorize() - codes[name] = code.reset_index(drop=True).astype(np.int64) + codes[name] = code.astype(np.int64) cats.name = None - cats = cats.reset_index(drop=True)._copy_construct(name=None) + cats = cudf.Series(cats)._copy_construct(name=None) levels.append(cats) self._levels = levels diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 300b17b1625..4c6589789bf 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -474,8 +474,8 @@ def test_series_factorize(data, na_sentinel): expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel) actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel) - assert_eq(expected_labels, actual_labels.to_array()) - assert_eq(expected_cats.values, actual_cats.to_array()) + assert_eq(expected_labels, actual_labels.get()) + assert_eq(expected_cats.values, actual_cats.to_pandas().values) @pytest.mark.parametrize( From d927cfca82a37da942033aa387ad251cfe795f10 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 18 Dec 2020 11:12:09 -0800 Subject: [PATCH 16/16] use column.dropna --- python/cudf/cudf/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index e2fd939e8c1..6cfe158651b 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -57,7 +57,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): values = Series(values) - cats = values.dropna().unique().astype(values.dtype) + cats = values._column.dropna().unique().astype(values.dtype) name = values.name # label_encoding mutates self.name labels = values.label_encoding(cats=cats, na_sentinel=na_sentinel).values