Support MultiIndex in DataFrame.unstack. (#1322)

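We can support `unstack` on a DataFrame with a MultiIndex by using `pivot_table`. However, as discussed in databricks/koalas#886 (comment), it should still be used with care: the resulting DataFrame can be very wide, which may seriously degrade performance.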
rising-star92 · Mar 4, 2020 · 7c9642d · 7c9642d
1 parent 3526461
commit 7c9642d
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 9 deletions.
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -7967,8 +7967,15 @@ def melt(self, id_vars=None, value_vars=None, var_name=None, value_name="value")
 
     def unstack(self):
         """
-        Pivot the (necessarily hierarchical) index labels. The output
-        will be a Series.
+        Pivot the (necessarily hierarchical) index labels.
+
+        Returns a DataFrame having a new level of column labels whose inner-most level
+        consists of the pivoted index labels.
+
+        If the index is not a MultiIndex, the output will be a Series.
+
+        .. note:: If the index is a MultiIndex, the output DataFrame could be very wide, and
+            it could cause a serious performance degradation since Spark partitions it row based.
 
         Returns
         -------
@@ -8014,14 +8021,56 @@ def unstack(self):
               1    4
               2    6
         Name: 0, dtype: object
+
+        For MultiIndex case:
+
+        >>> df = ks.DataFrame({"A": ["a", "b", "c"],
+        ...                    "B": [1, 3, 5],
+        ...                    "C": [2, 4, 6]},
+        ...                   columns=["A", "B", "C"])
+        >>> df = df.set_index('A', append=True)
+        >>> df  # doctest: +NORMALIZE_WHITESPACE
+             B  C
+          A
+        0 a  1  2
+        1 b  3  4
+        2 c  5  6
+        >>> df.unstack().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+             B              C
+        A    a    b    c    a    b    c
+        0  1.0  NaN  NaN  2.0  NaN  NaN
+        1  NaN  3.0  NaN  NaN  4.0  NaN
+        2  NaN  NaN  5.0  NaN  NaN  6.0
         """
         from databricks.koalas.series import _col
 
         if len(self._internal.index_columns) > 1:
-            raise NotImplementedError(
-                "Multi-index is not supported. Consider "
-                "using DataFrame.pivot_table or DataFrame.pivot instead."
+            # The index after `reset_index()` will never be used, so use "distributed" index
+            # as a dummy to avoid overhead.
+            with option_context("compute.default_index_type", "distributed"):
+                df = self.reset_index()
+            index = df._internal.column_labels[: len(self._internal.index_columns) - 1]
+            columns = df.columns[len(self._internal.index_columns) - 1]
+            df = df.pivot_table(
+                index=index, columns=columns, values=self._internal.column_labels, aggfunc="first"
             )
+            internal = df._internal.copy(
+                index_map=[
+                    (index_column, name)
+                    for index_column, name in zip(
+                        df._internal.index_columns, self._internal.index_names[:-1]
+                    )
+                ],
+                column_label_names=(
+                    df._internal.column_label_names[:-1]
+                    + [
+                        None
+                        if self._internal.index_names[-1] is None
+                        else df._internal.column_label_names[-1]
+                    ]
+                ),
+            )
+            return DataFrame(internal)
 
         # TODO: Codes here are similar with melt. Should we deduplicate?
         column_labels = self._internal.column_labels

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -1850,14 +1850,14 @@ def test_pivot_table_and_index(self):
         self.assert_eq(ktable.index, ptable.index)
         self.assert_eq(repr(ktable.index), repr(ptable.index))
 
-    def test_unstack_errors(self):
-        kdf = ks.DataFrame(
+    def test_unstack(self):
+        pdf = pd.DataFrame(
             np.random.randn(3, 3),
             index=pd.MultiIndex.from_tuples([("rg1", "x"), ("rg1", "y"), ("rg2", "z")]),
         )
+        kdf = ks.from_pandas(pdf)
 
-        with self.assertRaisesRegex(NotImplementedError, "Multi-index is not supported."):
-            kdf.unstack()
+        self.assert_eq(kdf.unstack().sort_index(), pdf.unstack().sort_index(), almost=True)
 
     def test_pivot_errors(self):
         kdf = ks.range(10)