Apply metadata to keys before returning in Frame._encode (#8560)

Fixes #7365

Applies column metadata to the output columns of `keys` in `Frame._encode`; skipping this step meant that the output of `DataFrame.unstack` would not have the expected metadata for index columns:

```python
import pandas as pd
import cudf

pdf = pd.DataFrame(
    {
        "foo": ["one", "one", "one", "two", "two", "two"],
        "bar": pd.Categorical(["A", "B", "C", "A", "B", "C"]),
        "baz": [1, 2, 3, 4, 5, 6],
        "zoo": ["x", "y", "z", "q", "w", "t"],
    }).set_index(["foo", "bar", "baz"])
gdf = cudf.from_pandas(pdf)

pdf.unstack("baz")
         zoo
baz        1    2    3    4    5    6
foo bar
one A      x  NaN  NaN  NaN  NaN  NaN
    B    NaN    y  NaN  NaN  NaN  NaN
    C    NaN  NaN    z  NaN  NaN  NaN
two A    NaN  NaN  NaN    q  NaN  NaN
    B    NaN  NaN  NaN  NaN    w  NaN
    C    NaN  NaN  NaN  NaN  NaN    t

gdf.unstack("baz")
           zoo
baz          1     2     3     4     5     6
foo bar
one 0        x  <NA>  <NA>  <NA>  <NA>  <NA>
    1     <NA>     y  <NA>  <NA>  <NA>  <NA>
    2     <NA>  <NA>     z  <NA>  <NA>  <NA>
two 0     <NA>  <NA>  <NA>     q  <NA>  <NA>
    1     <NA>  <NA>  <NA>  <NA>     w  <NA>
    2     <NA>  <NA>  <NA>  <NA>  <NA>     t
```

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #8560
rapidsai · Jul 16, 2021 · cd1866e · cd1866e
1 parent d07f994
commit cd1866e
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 5 deletions.
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -3251,6 +3251,11 @@ def _split(self, splits, keep_index=True):
     def _encode(self):
         keys, indices = libcudf.transform.table_encode(self)
         keys = self.__class__._from_table(keys)
+        for col in keys._data:
+            keys._data[col] = keys._data[col]._with_type_metadata(
+                self._data[col].dtype
+            )
+
         return keys, indices
 
     def _reindex(

diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
@@ -393,15 +393,35 @@ def test_pivot_multi_values():
     "level",
     [
         0,
-        1,
+        pytest.param(
+            1,
+            marks=pytest.mark.xfail(
+                reason="Categorical column indexes not supported"
+            ),
+        ),
         2,
         "foo",
-        "bar",
+        pytest.param(
+            "bar",
+            marks=pytest.mark.xfail(
+                reason="Categorical column indexes not supported"
+            ),
+        ),
         "baz",
         [],
-        [0, 1],
+        pytest.param(
+            [0, 1],
+            marks=pytest.mark.xfail(
+                reason="Categorical column indexes not supported"
+            ),
+        ),
         ["foo"],
-        ["foo", "bar"],
+        pytest.param(
+            ["foo", "bar"],
+            marks=pytest.mark.xfail(
+                reason="Categorical column indexes not supported"
+            ),
+        ),
         pytest.param(
             [0, 1, 2],
             marks=pytest.mark.xfail(reason="Pandas behaviour unclear"),
@@ -416,7 +436,7 @@ def test_unstack_multiindex(level):
     pdf = pd.DataFrame(
         {
             "foo": ["one", "one", "one", "two", "two", "two"],
-            "bar": ["A", "B", "C", "A", "B", "C"],
+            "bar": pd.Categorical(["A", "B", "C", "A", "B", "C"]),
             "baz": [1, 2, 3, 4, 5, 6],
             "zoo": ["x", "y", "z", "q", "w", "t"],
         }
@@ -436,6 +456,12 @@ def test_unstack_multiindex(level):
     [
         pd.Index(range(0, 5), name=None),
         pd.Index(range(0, 5), name="row_index"),
+        pytest.param(
+            pd.CategoricalIndex(["d", "e", "f", "g", "h"]),
+            marks=pytest.mark.xfail(
+                reason="Categorical column indexes not supported"
+            ),
+        ),
     ],
 )
 @pytest.mark.parametrize(