Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Share factorize implementation with Index and cudf module #6885

Merged
merged 20 commits into from
Dec 19, 2020
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- PR #6814 Implement `cudf::reduce` for `decimal32` and `decimal64` (part 1)
- PR #6929 Add `Index.set_names` api
- PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support
- PR #6885 Share `factorize` implementation with Index and cudf module

## Improvements

Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from_pandas,
merge,
)
from cudf.core.algorithms import factorize
from cudf.core.dtypes import CategoricalDtype
from cudf.core.groupby import Grouper
from cudf.core.ops import (
Expand Down
66 changes: 66 additions & 0 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
from warnings import warn

import cupy as cp

from cudf.core.series import Index, Series


def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
    """Encode the input values as integer labels

    Parameters
    ----------
    values : Series, Index, or CuPy array
        The data to be factorized.
    sort : bool, default False
        Not supported yet; a truthy value raises ``NotImplementedError``.
    na_sentinel : number, default -1
        Value to indicate missing category. ``None`` is not supported.
    size_hint : optional
        Not applicable to cudf; a warning is emitted when a truthy value
        is passed and the argument is otherwise ignored.

    Returns
    -------
    (labels, cats) : (CuPy array, Index or CuPy array)
        - *labels* contains the encoded values
        - *cats* contains the categories in order that the N-th
          item corresponds to the (N-1) code. It is a CuPy array when
          *values* is a CuPy array, and an Index otherwise.

    Raises
    ------
    NotImplementedError
        If *sort* is truthy or *na_sentinel* is ``None``.

    Examples
    --------
    >>> import cudf
    >>> data = cudf.Series(['a', 'c', 'c'])
    >>> codes, uniques = cudf.factorize(data)
    >>> codes
    array([0, 1, 1], dtype=int8)
    >>> uniques
    StringIndex(['a' 'c'], dtype='object')

    See Also
    --------
    cudf.Series.factorize

    """
    if sort:
        raise NotImplementedError(
            "Sorting not yet supported during factorization."
        )
    if na_sentinel is None:
        raise NotImplementedError("na_sentinel can not be None.")

    if size_hint:
        warn("size_hint is not applicable for cudf.factorize")

    # Remember the input kind before it is wrapped in a Series, so that
    # the categories can be handed back in a matching container.
    # ``cp.ndarray`` is the public alias of the array class; the private
    # ``cp.core.core`` module path is not a stable import location.
    return_cupy_array = isinstance(values, cp.ndarray)

    values = Series(values)

    # Nulls never become a category; they are encoded as ``na_sentinel``.
    cats = values.dropna().unique().astype(values.dtype)

    name = values.name  # label_encoding mutates self.name
    try:
        labels = values.label_encoding(
            cats=cats, na_sentinel=na_sentinel
        ).values
    finally:
        # Restore the name even if the encoding step raises.
        values.name = name

    return labels, (cats.values if return_cupy_array else Index(cats))
11 changes: 11 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,17 @@ def _clean_nulls_from_index(self):
else:
return self

def factorize(self, na_sentinel=-1):
    """
    Encode the values of this Index as integer labels.

    This is a thin wrapper that hands the Index to
    ``cudf.core.algorithms.factorize``.

    Parameters
    ----------
    na_sentinel : number, default -1
        Forwarded unchanged to ``cudf.core.algorithms.factorize``.

    Returns
    -------
    The result of ``cudf.core.algorithms.factorize`` for this Index.

    See Also
    --------
    cudf.Series.factorize

    """
    shared_factorize = cudf.core.algorithms.factorize
    return shared_factorize(self, na_sentinel=na_sentinel)

@property
def nlevels(self):
"""
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,9 +618,9 @@ def _compute_levels_and_codes(self):
codes = cudf.DataFrame()
for name in self._source_data.columns:
code, cats = self._source_data[name].factorize()
codes[name] = code.reset_index(drop=True).astype(np.int64)
codes[name] = code.astype(np.int64)
cats.name = None
cats = cats.reset_index(drop=True)._copy_construct(name=None)
cats = cudf.Series(cats)._copy_construct(name=None)
levels.append(cats)

self._levels = levels
Expand Down
8 changes: 1 addition & 7 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2574,13 +2574,7 @@ def factorize(self, na_sentinel=-1):
1 c
dtype: object
"""
cats = self.dropna().unique().astype(self.dtype)

name = self.name # label_encoding mutates self.name
labels = self.label_encoding(cats=cats, na_sentinel=na_sentinel)
self.name = name

return labels, cats
return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel)

# UDF related

Expand Down
107 changes: 94 additions & 13 deletions python/cudf/cudf/tests/test_factorize.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
# Copyright (c) 2018, NVIDIA CORPORATION.

import cupy as cp
import numpy as np
import pandas as pd
import pytest

from cudf.core import DataFrame, Series
import cudf
from cudf.core import DataFrame, Index
from cudf.tests.utils import assert_eq


@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
def test_factorize(ncats, nelem):
def test_factorize_series_obj(ncats, nelem):
df = DataFrame()
np.random.seed(0)

Expand All @@ -17,15 +20,34 @@ def test_factorize(ncats, nelem):

uvals, labels = df["cats"].factorize()
np.testing.assert_array_equal(labels.to_array(), sorted(set(arr)))
assert isinstance(uvals, Series)
assert isinstance(labels, Series)
assert isinstance(uvals, cp.core.core.ndarray)
assert isinstance(labels, Index)

encoder = dict((labels[idx], idx) for idx in range(len(labels)))
handcoded = [encoder[v] for v in arr]
np.testing.assert_array_equal(uvals.to_array(), handcoded)
np.testing.assert_array_equal(uvals.get(), handcoded)


def test_factorize_index():
@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
def test_factorize_index_obj(ncats, nelem):
    """Index.factorize returns CuPy codes and an Index of unique values."""
    # NOTE(review): ``ncats`` and ``nelem`` are accepted but never used;
    # every parametrized case draws the same 10 values from {0, 1}.
    # Confirm whether the data was meant to be sized by them.
    np.random.seed(0)
    arr = np.random.randint(2, size=10, dtype=np.int32)

    # Build a frame whose index holds the random values.
    df = DataFrame()
    df["cats"] = arr
    df = df.set_index("cats")

    codes, uniques = df.index.factorize()
    np.testing.assert_array_equal(uniques.values.get(), sorted(set(arr)))
    # ``cp.ndarray`` is the public name of the CuPy array class.
    assert isinstance(codes, cp.ndarray)
    assert isinstance(uniques, Index)

    # Re-derive the codes by hand from the returned uniques and compare.
    position_of = {uniques[pos]: pos for pos in range(len(uniques))}
    handcoded = [position_of[value] for value in arr]
    np.testing.assert_array_equal(codes.get(), handcoded)


def test_factorize_series_index():
df = DataFrame()
df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
df["col2"] = [
Expand All @@ -40,21 +62,80 @@ def test_factorize_index():
2992446.0,
2992448.0,
]

assert_eq(
df.col1.factorize()[0].to_array(), df.to_pandas().col1.factorize()[0]
)
assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0])
assert_eq(
df.col1.factorize()[1].to_pandas().values,
df.to_pandas().col1.factorize()[1].values,
)

df = df.set_index("col2")

assert_eq(
df.col1.factorize()[0].to_array(), df.to_pandas().col1.factorize()[0]
)
assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0])
assert_eq(
df.col1.factorize()[1].to_pandas().values,
df.to_pandas().col1.factorize()[1].values,
)


def test_cudf_factorize_series():
    """cudf.factorize on a Series should agree with pandas.factorize."""
    raw = [1, 2, 3, 4, 5]

    expected = pd.factorize(pd.Series(raw))
    actual = cudf.factorize(cudf.Series(raw))

    assert len(expected) == len(actual)

    # Element 0 holds the codes, element 1 the unique values.
    expected_codes, actual_codes = expected[0], actual[0]
    np.testing.assert_array_equal(expected_codes, actual_codes.get())

    expected_uniques, actual_uniques = expected[1], actual[1]
    np.testing.assert_array_equal(
        expected_uniques, actual_uniques.values.get()
    )


def test_cudf_factorize_index():
    """cudf.factorize on an Index should agree with pandas.factorize."""
    raw = [1, 2, 3, 4, 5]

    expected = pd.factorize(pd.Index(raw))
    actual = cudf.factorize(cudf.Index(raw))

    assert len(expected) == len(actual)

    # Element 0 holds the codes, element 1 the unique values.
    expected_codes, actual_codes = expected[0], actual[0]
    np.testing.assert_array_equal(expected_codes, actual_codes.get())

    expected_uniques, actual_uniques = expected[1], actual[1]
    np.testing.assert_array_equal(
        expected_uniques, actual_uniques.values.get()
    )


def test_cudf_factorize_array():
    """cudf.factorize on a CuPy array should agree with pandas on NumPy."""
    raw = [1, 2, 3, 4, 5]

    expected = pd.factorize(np.array(raw))
    actual = cudf.factorize(cp.array(raw))

    assert len(expected) == len(actual)

    # Both results are plain CuPy arrays here, so ``.get()`` copies each
    # one to the host for comparison.
    expected_codes, actual_codes = expected[0], actual[0]
    np.testing.assert_array_equal(expected_codes, actual_codes.get())

    expected_uniques, actual_uniques = expected[1], actual[1]
    np.testing.assert_array_equal(expected_uniques, actual_uniques.get())


def test_factorize_result_classes():
    """Check the container types cudf.factorize returns per input kind.

    Labels are always a CuPy array. Categories are a cudf Index for
    Series and Index inputs, and a CuPy array for CuPy array input.
    """
    data = [1, 2, 3]

    # ``cp.ndarray`` is the public name of the CuPy array class; the
    # private ``cp.core.core`` path is not a stable import location.
    labels, cats = cudf.factorize(cudf.Series(data))

    assert isinstance(labels, cp.ndarray)
    assert isinstance(cats, cudf.Index)

    labels, cats = cudf.factorize(cudf.Index(data))

    assert isinstance(labels, cp.ndarray)
    assert isinstance(cats, cudf.Index)

    labels, cats = cudf.factorize(cp.array(data))

    assert isinstance(labels, cp.ndarray)
    assert isinstance(cats, cp.ndarray)
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,8 +474,8 @@ def test_series_factorize(data, na_sentinel):
expected_labels, expected_cats = psr.factorize(na_sentinel=na_sentinel)
actual_labels, actual_cats = gsr.factorize(na_sentinel=na_sentinel)

assert_eq(expected_labels, actual_labels.to_array())
assert_eq(expected_cats.values, actual_cats.to_array())
assert_eq(expected_labels, actual_labels.get())
assert_eq(expected_cats.values, actual_cats.to_pandas().values)


@pytest.mark.parametrize(
Expand Down