From e34988f50c4e9fe4182ef31dd6d3f453fdc1bc8f Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 18 Jun 2021 17:13:56 -0400 Subject: [PATCH 1/3] Apply metadata to keys before returning in _encode --- python/cudf/cudf/core/frame.py | 5 +++++ python/cudf/cudf/tests/test_reshape.py | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5da6f7d8f2f..cc747d9d81c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3259,6 +3259,11 @@ def _split(self, splits, keep_index=True): def _encode(self): keys, indices = libcudf.transform.table_encode(self) keys = self.__class__._from_table(keys) + for col in keys._data: + keys._data[col] = keys._data[col]._with_type_metadata( + self._data[col].dtype + ) + return keys, indices def _reindex( diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b030924779d..5b2d6474df0 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -416,7 +416,7 @@ def test_unstack_multiindex(level): pdf = pd.DataFrame( { "foo": ["one", "one", "one", "two", "two", "two"], - "bar": ["A", "B", "C", "A", "B", "C"], + "bar": pd.Categorical(["A", "B", "C", "A", "B", "C"]), "baz": [1, 2, 3, 4, 5, 6], "zoo": ["x", "y", "z", "q", "w", "t"], } @@ -436,6 +436,7 @@ def test_unstack_multiindex(level): [ pd.Index(range(0, 5), name=None), pd.Index(range(0, 5), name="row_index"), + pd.CategoricalIndex(["d", "e", "f", "g", "h"]), ], ) @pytest.mark.parametrize( @@ -474,6 +475,20 @@ def test_unstack_index_invalid(): gdf.unstack() +def test_unstack_categorical_index(): + pdf = pd.DataFrame( + { + "foo": pd.Categorical(list("abcabc")), + "bar": [1, 2, 3, 4, 5, 6], + "baz": np.random.rand(6), + } + ).set_index(["foo", "bar"]) + + gdf = cudf.from_pandas(pdf) + + assert_eq(pdf.unstack("foo"), gdf.unstack("foo")) + + def test_pivot_duplicate_error(): gdf = cudf.DataFrame( {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} From 9ee2ad2fcbefbe3ecbcb8384f9ad0e7318e3c715 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 18 Jun 2021 17:27:43 -0400 Subject: [PATCH 2/3] Remove superfluous test --- python/cudf/cudf/tests/test_reshape.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 5b2d6474df0..bd0bc257096 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -475,20 +475,6 @@ def test_unstack_index_invalid(): gdf.unstack() -def test_unstack_categorical_index(): - pdf = pd.DataFrame( - { - "foo": pd.Categorical(list("abcabc")), - "bar": [1, 2, 3, 4, 5, 6], - "baz": np.random.rand(6), - } - ).set_index(["foo", "bar"]) - - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf.unstack("foo"), gdf.unstack("foo")) - - def test_pivot_duplicate_error(): gdf = cudf.DataFrame( {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} From 636fdb8dac3cd9951fbb73b9abb4c497471515c9 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 14 Jul 2021 16:42:06 -0400 Subject: [PATCH 3/3] xfail tests unstacking categorical index --- python/cudf/cudf/tests/test_reshape.py | 35 ++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index ea964ea371e..0161df7696c 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -393,15 +393,35 @@ def test_pivot_multi_values(): "level", [ 0, - 1, + pytest.param( + 1, + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), 2, "foo", - "bar", + pytest.param( + "bar", + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), "baz", [], - [0, 1], + pytest.param( + [0, 1], + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), ["foo"], - ["foo", "bar"], + pytest.param( + ["foo", "bar"], + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), pytest.param( [0, 1, 2], marks=pytest.mark.xfail(reason="Pandas behaviour unclear"), @@ -436,7 +456,12 @@ def test_unstack_multiindex(level): [ pd.Index(range(0, 5), name=None), pd.Index(range(0, 5), name="row_index"), - pd.CategoricalIndex(["d", "e", "f", "g", "h"]), + pytest.param( + pd.CategoricalIndex(["d", "e", "f", "g", "h"]), + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), ], ) @pytest.mark.parametrize(