Skip to content

Commit

Permalink
Apply metadata to keys before returning in Frame._encode (#8560)
Browse files Browse the repository at this point in the history
Fixes #7365 

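Applies column metadata to the output columns of `keys` in `Frame._encode`. Skipping this step meant that the output of `DataFrame.unstack` did not carry the expected metadata for index columns; in the example below, the categorical `bar` index level is displayed by cudf as its integer codes (0, 1, 2) instead of its categories (A, B, C) as in pandas: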

```python
import pandas as pd
import cudf

pdf = pd.DataFrame(
    {
        "foo": ["one", "one", "one", "two", "two", "two"],
        "bar": pd.Categorical(["A", "B", "C", "A", "B", "C"]),
        "baz": [1, 2, 3, 4, 5, 6],
        "zoo": ["x", "y", "z", "q", "w", "t"],
    }).set_index(["foo", "bar", "baz"])
gdf = cudf.from_pandas(pdf)

pdf.unstack("baz")
         zoo                         
baz        1    2    3    4    5    6
foo bar                              
one A      x  NaN  NaN  NaN  NaN  NaN
    B    NaN    y  NaN  NaN  NaN  NaN
    C    NaN  NaN    z  NaN  NaN  NaN
two A    NaN  NaN  NaN    q  NaN  NaN
    B    NaN  NaN  NaN  NaN    w  NaN
    C    NaN  NaN  NaN  NaN  NaN    t

gdf.unstack("baz")
          zoo                              
baz         1     2     3     4     5     6
foo bar                                    
one 0       x  <NA>  <NA>  <NA>  <NA>  <NA>
    1    <NA>     y  <NA>  <NA>  <NA>  <NA>
    2    <NA>  <NA>     z  <NA>  <NA>  <NA>
two 0    <NA>  <NA>  <NA>     q  <NA>  <NA>
    1    <NA>  <NA>  <NA>  <NA>     w  <NA>
    2    <NA>  <NA>  <NA>  <NA>  <NA>     t
```

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #8560
  • Loading branch information
charlesbluca authored Jul 16, 2021
1 parent d07f994 commit cd1866e
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 5 deletions.
5 changes: 5 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3251,6 +3251,11 @@ def _split(self, splits, keep_index=True):
def _encode(self):
    """Encode this frame and return a ``(keys, indices)`` pair.

    The frame is passed to ``libcudf.transform.table_encode``; the keys
    table it returns is rebuilt as an instance of this frame's class.
    Each keys column is then replaced by the result of
    ``_with_type_metadata`` called with the dtype of the same-named
    column of ``self``, so the keys carry the caller's column metadata.
    ``indices`` is returned exactly as produced by ``table_encode``.
    """
    encoded, indices = libcudf.transform.table_encode(self)
    keys = self.__class__._from_table(encoded)

    # Re-apply the source dtype metadata column by column; only existing
    # names are reassigned, so the set of columns does not change while
    # iterating.
    for name in keys._data:
        encoded_col = keys._data[name]
        source_dtype = self._data[name].dtype
        keys._data[name] = encoded_col._with_type_metadata(source_dtype)

    return keys, indices

def _reindex(
Expand Down
36 changes: 31 additions & 5 deletions python/cudf/cudf/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,15 +393,35 @@ def test_pivot_multi_values():
"level",
[
0,
1,
pytest.param(
1,
marks=pytest.mark.xfail(
reason="Categorical column indexes not supported"
),
),
2,
"foo",
"bar",
pytest.param(
"bar",
marks=pytest.mark.xfail(
reason="Categorical column indexes not supported"
),
),
"baz",
[],
[0, 1],
pytest.param(
[0, 1],
marks=pytest.mark.xfail(
reason="Categorical column indexes not supported"
),
),
["foo"],
["foo", "bar"],
pytest.param(
["foo", "bar"],
marks=pytest.mark.xfail(
reason="Categorical column indexes not supported"
),
),
pytest.param(
[0, 1, 2],
marks=pytest.mark.xfail(reason="Pandas behaviour unclear"),
Expand All @@ -416,7 +436,7 @@ def test_unstack_multiindex(level):
pdf = pd.DataFrame(
{
"foo": ["one", "one", "one", "two", "two", "two"],
"bar": ["A", "B", "C", "A", "B", "C"],
"bar": pd.Categorical(["A", "B", "C", "A", "B", "C"]),
"baz": [1, 2, 3, 4, 5, 6],
"zoo": ["x", "y", "z", "q", "w", "t"],
}
Expand All @@ -436,6 +456,12 @@ def test_unstack_multiindex(level):
[
pd.Index(range(0, 5), name=None),
pd.Index(range(0, 5), name="row_index"),
pytest.param(
pd.CategoricalIndex(["d", "e", "f", "g", "h"]),
marks=pytest.mark.xfail(
reason="Categorical column indexes not supported"
),
),
],
)
@pytest.mark.parametrize(
Expand Down

0 comments on commit cd1866e

Please sign in to comment.