Skip to content

Commit

Permalink
Support MultiIndex in DataFrame.unstack. (#1322)
Browse files Browse the repository at this point in the history
We can support unstack with MultiIndex using pivot_table.
But as discussed at databricks/koalas#886 (comment), we should still be careful when using it.
  • Loading branch information
rising-star92 committed Mar 4, 2020
1 parent 3526461 commit 7c9642d
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 9 deletions.
59 changes: 54 additions & 5 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7967,8 +7967,15 @@ def melt(self, id_vars=None, value_vars=None, var_name=None, value_name="value")

def unstack(self):
"""
Pivot the (necessarily hierarchical) index labels. The output
will be a Series.
Pivot the (necessarily hierarchical) index labels.
Returns a DataFrame having a new level of column labels whose inner-most level
consists of the pivoted index labels.
If the index is not a MultiIndex, the output will be a Series.
.. note:: If the index is a MultiIndex, the output DataFrame could be very wide, and
it could cause serious performance degradation since Spark partitions data by row.
Returns
-------
Expand Down Expand Up @@ -8014,14 +8021,56 @@ def unstack(self):
1 4
2 6
Name: 0, dtype: object
For MultiIndex case:
>>> df = ks.DataFrame({"A": ["a", "b", "c"],
... "B": [1, 3, 5],
... "C": [2, 4, 6]},
... columns=["A", "B", "C"])
>>> df = df.set_index('A', append=True)
>>> df # doctest: +NORMALIZE_WHITESPACE
B C
A
0 a 1 2
1 b 3 4
2 c 5 6
>>> df.unstack().sort_index() # doctest: +NORMALIZE_WHITESPACE
B C
A a b c a b c
0 1.0 NaN NaN 2.0 NaN NaN
1 NaN 3.0 NaN NaN 4.0 NaN
2 NaN NaN 5.0 NaN NaN 6.0
"""
from databricks.koalas.series import _col

if len(self._internal.index_columns) > 1:
raise NotImplementedError(
"Multi-index is not supported. Consider "
"using DataFrame.pivot_table or DataFrame.pivot instead."
# The index after `reset_index()` will never be used, so use "distributed" index
# as a dummy to avoid overhead.
with option_context("compute.default_index_type", "distributed"):
df = self.reset_index()
index = df._internal.column_labels[: len(self._internal.index_columns) - 1]
columns = df.columns[len(self._internal.index_columns) - 1]
df = df.pivot_table(
index=index, columns=columns, values=self._internal.column_labels, aggfunc="first"
)
internal = df._internal.copy(
index_map=[
(index_column, name)
for index_column, name in zip(
df._internal.index_columns, self._internal.index_names[:-1]
)
],
column_label_names=(
df._internal.column_label_names[:-1]
+ [
None
if self._internal.index_names[-1] is None
else df._internal.column_label_names[-1]
]
),
)
return DataFrame(internal)

# TODO: The code here is similar to melt. Should we deduplicate?
column_labels = self._internal.column_labels
Expand Down
8 changes: 4 additions & 4 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1850,14 +1850,14 @@ def test_pivot_table_and_index(self):
self.assert_eq(ktable.index, ptable.index)
self.assert_eq(repr(ktable.index), repr(ptable.index))

def test_unstack_errors(self):
kdf = ks.DataFrame(
def test_unstack(self):
pdf = pd.DataFrame(
np.random.randn(3, 3),
index=pd.MultiIndex.from_tuples([("rg1", "x"), ("rg1", "y"), ("rg2", "z")]),
)
kdf = ks.from_pandas(pdf)

with self.assertRaisesRegex(NotImplementedError, "Multi-index is not supported."):
kdf.unstack()
self.assert_eq(kdf.unstack().sort_index(), pdf.unstack().sort_index(), almost=True)

def test_pivot_errors(self):
kdf = ks.range(10)
Expand Down

0 comments on commit 7c9642d

Please sign in to comment.