diff --git a/databricks/koalas/plot.py b/databricks/koalas/plot.py index 7604d4a594..9b2e1395e0 100644 --- a/databricks/koalas/plot.py +++ b/databricks/koalas/plot.py @@ -415,16 +415,16 @@ def _make_plot(self): colors = self._get_colors(num_colors=1) stacking_id = self._get_stacking_id() - sdf = self.data.to_spark() + sdf = self.data._sdf - for i, data_column in enumerate(self.data._internal.data_columns): + for i, idx in enumerate(self.data._internal.column_index): # 'y' is a Spark DataFrame that selects one column. - y = sdf.select(data_column) + y = sdf.select(self.data._internal.scol_for(idx)) ax = self._get_ax(i) kwds = self.kwds.copy() - label = pprint_thing(data_column) + label = pprint_thing(idx if len(idx) > 1 else idx[0]) kwds['label'] = label style, kwds = self._apply_style_colors(colors, kwds, i, label) @@ -579,17 +579,17 @@ def _make_plot(self): colors = self._get_colors(num_colors=1) stacking_id = self._get_stacking_id() - sdf = self.data.to_spark() + sdf = self.data._sdf - for i, data_column in enumerate(self.data._internal.data_columns): + for i, idx in enumerate(self.data._internal.column_index): # 'y' is a Spark DataFrame that selects one column. - y = sdf.select(data_column) + y = sdf.select(self.data._internal.scol_for(idx)) ax = self._get_ax(i) kwds = self.kwds.copy() - label = pprint_thing(data_column) - kwds['label'] = data_column + label = pprint_thing(idx if len(idx) > 1 else idx[0]) + kwds['label'] = label style, kwds = self._apply_style_colors(colors, kwds, i, label) if style is not None: diff --git a/databricks/koalas/tests/test_frame_plot.py b/databricks/koalas/tests/test_frame_plot.py index 0f8bf8d764..b4f69d14ef 100644 --- a/databricks/koalas/tests/test_frame_plot.py +++ b/databricks/koalas/tests/test_frame_plot.py @@ -55,43 +55,77 @@ def compare_plots(self, ax1, ax2): self.assert_eq(self.plot_to_base64(ax1), self.plot_to_base64(ax2)) def test_line_plot(self): + + def _test_line_plot(pdf, kdf): + ax1 = pdf.plot(kind="line", colormap='Paired') + ax2 = kdf.plot(kind="line", colormap='Paired') + self.compare_plots(ax1, ax2) + + ax3 = pdf.plot.line(colormap='Paired') + ax4 = kdf.plot.line(colormap='Paired') + self.compare_plots(ax3, ax4) + pdf = self.pdf1 kdf = self.kdf1 + _test_line_plot(pdf, kdf) - ax1 = pdf.plot(kind="line", colormap='Paired') - ax2 = kdf.plot(kind="line", colormap='Paired') - self.compare_plots(ax1, ax2) - - ax3 = pdf.plot.line(colormap='Paired') - ax4 = kdf.plot.line(colormap='Paired') - self.compare_plots(ax3, ax4) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')]) + pdf.columns = columns + kdf.columns = columns + _test_line_plot(pdf, kdf) def test_area_plot(self): + + def _test_are_plot(pdf, kdf): + + ax1 = pdf.plot(kind="area", colormap='Paired') + ax2 = kdf.plot(kind="area", colormap='Paired') + self.compare_plots(ax1, ax2) + + ax3 = pdf.plot.area(colormap='Paired') + ax4 = kdf.plot.area(colormap='Paired') + self.compare_plots(ax3, ax4) + pdf = self.pdf1 kdf = self.kdf1 + _test_are_plot(pdf, kdf) - ax1 = pdf.plot(kind="area", colormap='Paired') - ax2 = kdf.plot(kind="area", colormap='Paired') - self.compare_plots(ax1, ax2) - - ax3 = pdf.plot.area(colormap='Paired') - ax4 = kdf.plot.area(colormap='Paired') - self.compare_plots(ax3, ax4) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')]) + pdf.columns = columns + kdf.columns = columns + _test_are_plot(pdf, kdf) def test_area_plot_stacked_false(self): - # test if frame area plot is correct when stacked=False because default is True + + def _test_area_plot_stacked_false(pdf, kdf): + ax1 = pdf.plot.area(stacked=False) + ax2 = kdf.plot.area(stacked=False) + self.compare_plots(ax1, ax2) + + # test if frame area plot is correct when stacked=False because default is True pdf = pd.DataFrame({ 'sales': [3, 2, 3, 9, 10, 6], 'signups': [5, 5, 6, 12, 14, 13], 'visits': [20, 42, 28, 62, 81, 50], }, index=pd.date_range(start='2018/01/01', end='2018/07/01', freq='M')) kdf = koalas.from_pandas(pdf) + _test_area_plot_stacked_false(pdf, kdf) - ax1 = pdf.plot.area(stacked=False) - ax2 = kdf.plot.area(stacked=False) - self.compare_plots(ax1, ax2) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'sales'), ('x', 'signups'), ('y', 'visits')]) + pdf.columns = columns + kdf.columns = columns + _test_area_plot_stacked_false(pdf, kdf) def test_area_plot_y(self): + + def _test_area_plot_y(pdf, kdf, y): + ax1 = pdf.plot.area(y=y) + ax2 = kdf.plot.area(y=y) + self.compare_plots(ax1, ax2) + # test if frame area plot is correct when y is specified pdf = pd.DataFrame({ 'sales': [3, 2, 3, 9, 10, 6], @@ -99,48 +133,78 @@ def test_area_plot_y(self): 'visits': [20, 42, 28, 62, 81, 50], }, index=pd.date_range(start='2018/01/01', end='2018/07/01', freq='M')) kdf = koalas.from_pandas(pdf) + _test_area_plot_y(pdf, kdf, y='sales') - ax1 = pdf.plot.area(y='sales') - ax2 = kdf.plot.area(y='sales') - self.compare_plots(ax1, ax2) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'sales'), ('x', 'signups'), ('y', 'visits')]) + pdf.columns = columns + kdf.columns = columns + _test_area_plot_y(pdf, kdf, y=('x', 'sales')) def test_barh_plot_with_x_y(self): + + def _test_barh_plot_with_x_y(pdf, kdf, x, y): + ax1 = pdf.plot(kind="barh", x=x, y=y, colormap='Paired') + ax2 = kdf.plot(kind="barh", x=x, y=y, colormap='Paired') + self.compare_plots(ax1, ax2) + + ax3 = pdf.plot.barh(x=x, y=y, colormap='Paired') + ax4 = kdf.plot.barh(x=x, y=y, colormap='Paired') + self.compare_plots(ax3, ax4) + # this is testing plot with specified x and y pdf = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) kdf = koalas.from_pandas(pdf) + _test_barh_plot_with_x_y(pdf, kdf, x='lab', y='val') - ax1 = pdf.plot(kind="barh", x='lab', y='val', colormap='Paired') - ax2 = kdf.plot(kind="barh", x='lab', y='val', colormap='Paired') - self.compare_plots(ax1, ax2) - - ax3 = pdf.plot.barh(x='lab', y='val', colormap='Paired') - ax4 = kdf.plot.barh(x='lab', y='val', colormap='Paired') - self.compare_plots(ax3, ax4) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'lab'), ('y', 'val')]) + pdf.columns = columns + kdf.columns = columns + _test_barh_plot_with_x_y(pdf, kdf, x=('x', 'lab'), y=('y', 'val')) def test_barh_plot(self): + + def _test_barh_plot(pdf, kdf): + ax1 = pdf.plot(kind="barh", colormap='Paired') + ax2 = kdf.plot(kind="barh", colormap='Paired') + self.compare_plots(ax1, ax2) + + ax3 = pdf.plot.barh(colormap='Paired') + ax4 = kdf.plot.barh(colormap='Paired') + self.compare_plots(ax3, ax4) + # this is testing when x or y is not assigned pdf = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) kdf = koalas.from_pandas(pdf) + _test_barh_plot(pdf, kdf) - ax1 = pdf.plot(kind="barh", colormap='Paired') - ax2 = kdf.plot(kind="barh", colormap='Paired') - self.compare_plots(ax1, ax2) - - ax3 = pdf.plot.barh(colormap='Paired') - ax4 = kdf.plot.barh(colormap='Paired') - self.compare_plots(ax3, ax4) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'lab'), ('y', 'val')]) + pdf.columns = columns + kdf.columns = columns + _test_barh_plot(pdf, kdf) def test_bar_plot(self): + + def _test_bar_plot(pdf, kdf): + ax1 = pdf.plot(kind='bar', colormap='Paired') + ax2 = kdf.plot(kind='bar', colormap='Paired') + self.compare_plots(ax1, ax2) + + ax3 = pdf.plot.bar(colormap='Paired') + ax4 = kdf.plot.bar(colormap='Paired') + self.compare_plots(ax3, ax4) + pdf = self.pdf1 kdf = self.kdf1 + _test_bar_plot(pdf, kdf) - ax1 = pdf.plot(kind='bar', colormap='Paired') - ax2 = kdf.plot(kind='bar', colormap='Paired') - self.compare_plots(ax1, ax2) - - ax3 = pdf.plot.bar(colormap='Paired') - ax4 = kdf.plot.bar(colormap='Paired') - self.compare_plots(ax3, ax4) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'lab'), ('y', 'val')]) + pdf.columns = columns + kdf.columns = columns + _test_bar_plot(pdf, kdf) def test_bar_with_x_y(self): # this is testing plot with specified x and y @@ -155,28 +219,51 @@ def test_bar_with_x_y(self): ax4 = kdf.plot.bar(x='lab', y='val', colormap='Paired') self.compare_plots(ax3, ax4) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'lab'), ('y', 'val')]) + pdf.columns = columns + kdf.columns = columns + + ax5 = pdf.plot(kind="bar", x=('x', 'lab'), y=('y', 'val'), colormap='Paired') + ax6 = kdf.plot(kind="bar", x=('x', 'lab'), y=('y', 'val'), colormap='Paired') + self.compare_plots(ax5, ax6) + + ax7 = pdf.plot.bar(x=('x', 'lab'), y=('y', 'val'), colormap='Paired') + ax8 = kdf.plot.bar(x=('x', 'lab'), y=('y', 'val'), colormap='Paired') + self.compare_plots(ax7, ax8) + def test_pie_plot(self): - pdf = pd.DataFrame({'mass': [0.330, 4.87, 5.97], 'radius': [2439.7, 6051.8, 6378.1]}, - index=['Mercury', 'Venus', 'Earth']) - kdf = koalas.from_pandas(pdf) - ax1 = pdf.plot.pie(y='mass', figsize=(5, 5), colormap='Paired') - ax2 = kdf.plot.pie(y='mass', figsize=(5, 5), colormap='Paired') - self.compare_plots(ax1, ax2) + def _test_pie_plot(pdf, kdf, y): - ax1 = pdf.plot(kind="pie", y='mass', figsize=(5, 5), colormap='Paired') - ax2 = kdf.plot(kind="pie", y='mass', figsize=(5, 5), colormap='Paired') - self.compare_plots(ax1, ax2) + ax1 = pdf.plot.pie(y=y, figsize=(5, 5), colormap='Paired') + ax2 = kdf.plot.pie(y=y, figsize=(5, 5), colormap='Paired') + self.compare_plots(ax1, ax2) + + ax1 = pdf.plot(kind="pie", y=y, figsize=(5, 5), colormap='Paired') + ax2 = kdf.plot(kind="pie", y=y, figsize=(5, 5), colormap='Paired') + self.compare_plots(ax1, ax2) + + ax11, ax12 = pdf.plot.pie(figsize=(5, 5), subplots=True, colormap='Paired') + ax21, ax22 = kdf.plot.pie(figsize=(5, 5), subplots=True, colormap='Paired') + self.compare_plots(ax11, ax21) + self.compare_plots(ax12, ax22) - ax11, ax12 = pdf.plot.pie(figsize=(5, 5), subplots=True, colormap='Paired') - ax21, ax22 = kdf.plot.pie(figsize=(5, 5), subplots=True, colormap='Paired') - self.compare_plots(ax11, ax21) - self.compare_plots(ax12, ax22) + ax11, ax12 = pdf.plot(kind="pie", figsize=(5, 5), subplots=True, colormap='Paired') + ax21, ax22 = kdf.plot(kind="pie", figsize=(5, 5), subplots=True, colormap='Paired') + self.compare_plots(ax11, ax21) + self.compare_plots(ax12, ax22) + + pdf = pd.DataFrame({'mass': [0.330, 4.87, 5.97], 'radius': [2439.7, 6051.8, 6378.1]}, + index=['Mercury', 'Venus', 'Earth']) + kdf = koalas.from_pandas(pdf) + _test_pie_plot(pdf, kdf, y='mass') - ax11, ax12 = pdf.plot(kind="pie", figsize=(5, 5), subplots=True, colormap='Paired') - ax21, ax22 = kdf.plot(kind="pie", figsize=(5, 5), subplots=True, colormap='Paired') - self.compare_plots(ax11, ax21) - self.compare_plots(ax12, ax22) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'mass'), ('y', 'radius')]) + pdf.columns = columns + kdf.columns = columns + _test_pie_plot(pdf, kdf, y=('x', 'mass')) def test_pie_plot_error_message(self): # this is to test if error is correctly raising when y is not specified @@ -191,44 +278,62 @@ def test_pie_plot_error_message(self): self.assertTrue(error_message in str(context.exception)) def test_scatter_plot(self): + + def _test_scatter_plot(pdf, kdf, x, y, c): + ax1 = pdf.plot.scatter(x=x, y=y) + ax2 = kdf.plot.scatter(x=x, y=y) + self.compare_plots(ax1, ax2) + + ax1 = pdf.plot(kind='scatter', x=x, y=y) + ax2 = kdf.plot(kind='scatter', x=x, y=y) + self.compare_plots(ax1, ax2) + + # check when keyword c is given as name of a column + ax1 = pdf.plot.scatter(x=x, y=y, c=c, s=50) + ax2 = kdf.plot.scatter(x=x, y=y, c=c, s=50) + self.compare_plots(ax1, ax2) + # Use pandas scatter plot example pdf = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd']) kdf = koalas.from_pandas(pdf) + _test_scatter_plot(pdf, kdf, x='a', y='b', c='c') - ax1 = pdf.plot.scatter(x='a', y='b') - ax2 = kdf.plot.scatter(x='a', y='b') - self.compare_plots(ax1, ax2) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c'), ('z', 'd')]) + pdf.columns = columns + kdf.columns = columns + _test_scatter_plot(pdf, kdf, x=('x', 'a'), y=('x', 'b'), c=('y', 'c')) - ax1 = pdf.plot(kind='scatter', x='a', y='b') - ax2 = kdf.plot(kind='scatter', x='a', y='b') - self.compare_plots(ax1, ax2) + def test_hist_plot(self): - # check when keyword c is given as name of a column - ax1 = pdf.plot.scatter(x='a', y='b', c='c', s=50) - ax2 = kdf.plot.scatter(x='a', y='b', c='c', s=50) - self.compare_plots(ax1, ax2) + def _test_hist_plot(pdf, kdf): + _, ax1 = plt.subplots(1, 1) + ax1 = pdf.plot.hist() + _, ax2 = plt.subplots(1, 1) + ax2 = kdf.plot.hist() + self.compare_plots(ax1, ax2) - def test_hist_plot(self): - pdf = self.pdf1 - kdf = self.kdf1 + ax1 = pdf.plot.hist(bins=15) + ax2 = kdf.plot.hist(bins=15) + self.compare_plots(ax1, ax2) - _, ax1 = plt.subplots(1, 1) - ax1 = pdf.plot.hist() - _, ax2 = plt.subplots(1, 1) - ax2 = kdf.plot.hist() - self.compare_plots(ax1, ax2) + ax1 = pdf.plot(kind='hist', bins=15) + ax2 = kdf.plot(kind='hist', bins=15) + self.compare_plots(ax1, ax2) - ax1 = pdf.plot.hist(bins=15) - ax2 = kdf.plot.hist(bins=15) - self.compare_plots(ax1, ax2) + ax1 = pdf.plot.hist(bins=3, bottom=[2, 1, 3]) + ax2 = kdf.plot.hist(bins=3, bottom=[2, 1, 3]) + self.compare_plots(ax1, ax2) - ax1 = pdf.plot(kind='hist', bins=15) - ax2 = kdf.plot(kind='hist', bins=15) - self.compare_plots(ax1, ax2) + pdf = self.pdf1 + kdf = self.kdf1 + _test_hist_plot(pdf, kdf) - ax1 = pdf.plot.hist(bins=3, bottom=[2, 1, 3]) - ax2 = kdf.plot.hist(bins=3, bottom=[2, 1, 3]) - self.compare_plots(ax1, ax2) + # multi-index columns + columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')]) + pdf.columns = columns + kdf.columns = columns + _test_hist_plot(pdf, kdf) def test_missing(self): ks = self.kdf1