From 4aadff4db59243b4510a874fef2bdb17402d1674 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 11 Mar 2024 16:44:20 -0700 Subject: [PATCH] feat: (Series|Dataframe).plot.hist() (#420) * feat: (Series|Dataframe).plot.hist() --- bigframes/dataframe.py | 5 + bigframes/operations/_matplotlib/__init__.py | 30 +++ bigframes/operations/_matplotlib/core.py | 30 +++ bigframes/operations/_matplotlib/hist.py | 172 ++++++++++++++++++ bigframes/operations/plotting.py | 34 ++++ bigframes/series.py | 5 + docs/reference/bigframes.pandas/frame.rst | 11 ++ docs/reference/bigframes.pandas/series.rst | 9 + setup.py | 1 + testing/constraints-3.9.txt | 1 + tests/system/small/operations/test_plot.py | 168 +++++++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 11 ++ .../bigframes_vendored/pandas/core/series.py | 11 ++ .../pandas/plotting/_core.py | 48 +++++ 14 files changed, 536 insertions(+) create mode 100644 bigframes/operations/_matplotlib/__init__.py create mode 100644 bigframes/operations/_matplotlib/core.py create mode 100644 bigframes/operations/_matplotlib/hist.py create mode 100644 bigframes/operations/plotting.py create mode 100644 tests/system/small/operations/test_plot.py create mode 100644 third_party/bigframes_vendored/pandas/plotting/_core.py diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 24c4699473..a122212d04 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -59,6 +59,7 @@ import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.operations.plotting as plotting import bigframes.series import bigframes.series as bf_series import bigframes.session._io.bigquery @@ -3193,4 +3194,8 @@ def get_right_id(id): return result + @property + def plot(self): + return plotting.PlotAccessor(self) + __matmul__ = dot diff --git a/bigframes/operations/_matplotlib/__init__.py 
b/bigframes/operations/_matplotlib/__init__.py new file mode 100644 index 0000000000..f8770a9ef8 --- /dev/null +++ b/bigframes/operations/_matplotlib/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.operations._matplotlib.core as core +import bigframes.operations._matplotlib.hist as hist + +PLOT_CLASSES: dict[str, type[core.MPLPlot]] = { + "hist": hist.HistPlot, +} + + +def plot(data, kind, **kwargs): + plot_obj = PLOT_CLASSES[kind](data, **kwargs) + plot_obj.generate() + plot_obj.draw() + return plot_obj.result + + +__all__ = ["plot"] diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py new file mode 100644 index 0000000000..4b15d6f4dd --- /dev/null +++ b/bigframes/operations/_matplotlib/core.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import abc + +import matplotlib.pyplot as plt + + +class MPLPlot(abc.ABC): + @abc.abstractmethod + def generate(self): + pass + + def draw(self) -> None: + plt.draw_if_interactive() + + @property + def result(self): + return self.axes diff --git a/bigframes/operations/_matplotlib/hist.py b/bigframes/operations/_matplotlib/hist.py new file mode 100644 index 0000000000..720b94d7da --- /dev/null +++ b/bigframes/operations/_matplotlib/hist.py @@ -0,0 +1,172 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from typing import Literal + +import numpy as np +import pandas as pd + +import bigframes.constants as constants +import bigframes.operations._matplotlib.core as bfplt + + +class HistPlot(bfplt.MPLPlot): + @property + def _kind(self) -> Literal["hist"]: + return "hist" + + def __init__( + self, + data, + bins: int = 10, + **kwargs, + ) -> None: + self.bins = bins + self.label = kwargs.get("label", None) + self.by = kwargs.pop("by", None) + self.kwargs = kwargs + + if self.by is not None: + raise NotImplementedError( + f"Non-none `by` argument is not yet supported. {constants.FEEDBACK_LINK}" + ) + if not isinstance(self.bins, int): + raise NotImplementedError( + f"Only integer values are supported for the `bins` argument. {constants.FEEDBACK_LINK}" + ) + if kwargs.get("weight", None) is not None: + raise NotImplementedError( + f"Non-none `weight` argument is not yet supported. 
{constants.FEEDBACK_LINK}" + ) + + self.data = self._compute_plot_data(data) + + def generate(self) -> None: + """ + Calculates weighted histograms through BigQuery and plots them through pandas + native histogram plot. + """ + hist_bars = self._calculate_hist_bars(self.data, self.bins) + bin_edges = self._calculate_bin_edges( + hist_bars, self.bins, self.kwargs.get("range", None) + ) + + weights = { + col_name: hist_bar.values for col_name, hist_bar in hist_bars.items() + } + hist_x = { + col_name: pd.Series( + ( + hist_bar.index.get_level_values("left_exclusive") + + hist_bar.index.get_level_values("right_inclusive") + ) + / 2.0 + ) + for col_name, hist_bar in hist_bars.items() + } + + # Align DataFrames for plotting despite potential differences in column + # lengths, filling shorter columns with zeros. + hist_x_pd = pd.DataFrame( + list(itertools.zip_longest(*hist_x.values())), columns=list(hist_x.keys()) + ).sort_index(axis=1)[self.data.columns.values] + weights_pd = pd.DataFrame( + list(itertools.zip_longest(*weights.values())), columns=list(weights.keys()) + ).sort_index(axis=1)[self.data.columns.values] + + # Prevents pandas from dropping NA values and causing length mismatches by + # filling them with zeros. + hist_x_pd.fillna(0, inplace=True) + weights_pd.fillna(0, inplace=True) + + self.axes = hist_x_pd.plot.hist( + bins=bin_edges, + weights=np.array(weights_pd.values), + **self.kwargs, + ) # type: ignore + + def _compute_plot_data(self, data): + """ + Prepares data for plotting, focusing on numeric data types. + + Raises: + TypeError: If the input data contains no numeric columns. + """ + # Importing at the top of the file causes a circular import. + import bigframes.series as series + + if isinstance(data, series.Series): + label = self.label + if label is None and data.name is None: + label = "" + if label is None: + data = data.to_frame() + else: + data = data.to_frame(name=label) + + # TODO(chelsealin): Support timestamp/date types here. 
+ include_type = ["number"] + numeric_data = data.select_dtypes(include=include_type) + try: + is_empty = numeric_data.columns.empty + except AttributeError: + is_empty = not len(numeric_data) + + if is_empty: + raise TypeError("no numeric data to plot") + + return numeric_data + + @staticmethod + def _calculate_hist_bars(data, bins): + """ + Calculates histogram bars for each column in a BigFrames DataFrame, and + returns a dictionary where keys are column names and values are pandas + Series. The series values are the histogram bins' heights with a + multi-index defining 'left_exclusive' and 'right_inclusive' bin edges. + """ + import bigframes.pandas as bpd + + # TODO: Optimize this by batching multiple jobs into one. + hist_bar = {} + for _, col in enumerate(data.columns): + cutted_data = bpd.cut(data[col], bins=bins, labels=None) + hist_bar[col] = ( + cutted_data.struct.explode() + .value_counts() + .to_pandas() + .sort_index(level="left_exclusive") + ) + return hist_bar + + @staticmethod + def _calculate_bin_edges(hist_bars, bins, range): + """ + Calculate bin edges from the histogram bars. + """ + bin_edges = None + for _, hist_bar in hist_bars.items(): + left = hist_bar.index.get_level_values("left_exclusive") + right = hist_bar.index.get_level_values("right_inclusive") + if bin_edges is None: + bin_edges = left.union(right) + else: + bin_edges = left.union(right).union(bin_edges) + + if bin_edges is None: + return None + + _, bins = np.histogram(bin_edges, bins=bins, range=range) + return bins diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py new file mode 100644 index 0000000000..ef36e9383a --- /dev/null +++ b/bigframes/operations/plotting.py @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Sequence + +import bigframes.constants as constants +import bigframes.operations._matplotlib as bfplt +import third_party.bigframes_vendored.pandas.plotting._core as vendordt + + +class PlotAccessor: + __doc__ = vendordt.PlotAccessor.__doc__ + + def __init__(self, data) -> None: + self._parent = data + + def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs): + if kwargs.pop("backend", None) is not None: + raise NotImplementedError( + f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}" + ) + # Calls matplotlib backend to plot the data. 
+ return bfplt.plot(self._parent.copy(), kind="hist", by=by, bins=bins, **kwargs) diff --git a/bigframes/series.py b/bigframes/series.py index dfa6fa4b0d..21f1f3b4e4 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -50,6 +50,7 @@ import bigframes.operations.aggregations as agg_ops import bigframes.operations.base import bigframes.operations.datetimes as dt +import bigframes.operations.plotting as plotting import bigframes.operations.strings as strings import bigframes.operations.structs as structs import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series @@ -1557,6 +1558,10 @@ def __array_ufunc__( def str(self) -> strings.StringMethods: return strings.StringMethods(self._block) + @property + def plot(self): + return plotting.PlotAccessor(self) + def _slice( self, start: typing.Optional[int] = None, diff --git a/docs/reference/bigframes.pandas/frame.rst b/docs/reference/bigframes.pandas/frame.rst index a49bcc8f7c..d1610accdd 100644 --- a/docs/reference/bigframes.pandas/frame.rst +++ b/docs/reference/bigframes.pandas/frame.rst @@ -7,3 +7,14 @@ DataFrame :members: :inherited-members: :undoc-members: + +Accessors +--------- + +Plotting handling +^^^^^^^^^^^^^^^^^ + +.. automodule:: bigframes.operations.plotting + :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.pandas/series.rst b/docs/reference/bigframes.pandas/series.rst index e212904f3f..f14eb8e862 100644 --- a/docs/reference/bigframes.pandas/series.rst +++ b/docs/reference/bigframes.pandas/series.rst @@ -42,3 +42,12 @@ Struct handling :members: :inherited-members: :undoc-members: + +Plotting handling +^^^^^^^^^^^^^^^^^ + +.. 
automodule:: bigframes.operations.plotting + :members: + :inherited-members: + :undoc-members: + :noindex: diff --git a/setup.py b/setup.py index 516d5b8a19..027c1b76af 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", + "matplotlib >= 3.7.1", ] extras = { # Optional test dependencies packages. If they're missed, may skip some tests. diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index c4fed64fbd..07c8b763f3 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,5 +20,6 @@ sqlglot==20.8.0 tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 +matplotlib==3.7.1 # extras pandas-gbq==0.19.0 diff --git a/tests/system/small/operations/test_plot.py b/tests/system/small/operations/test_plot.py new file mode 100644 index 0000000000..44f31ec071 --- /dev/null +++ b/tests/system/small/operations/test_plot.py @@ -0,0 +1,168 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas._testing as tm +import pytest + + +def _check_legend_labels(ax, labels): + """ + Check the ax has expected legend label + """ + assert ax.get_legend() is not None + texts = ax.get_legend().get_texts() + if not isinstance(texts, list): + assert texts.get_text() == labels + else: + actual_labels = [t.get_text() for t in texts] + assert len(actual_labels) == len(labels) + for label, e in zip(actual_labels, labels): + assert label == e + + +def test_series_hist_bins(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bins = 5 + ax = scalars_df["int64_col"].plot.hist(bins=bins) + pd_ax = scalars_pandas_df["int64_col"].plot.hist(bins=bins) + + # Compares axis values and height between bigframes and pandas histograms. + # Note: Due to potential float rounding by matplotlib, this test may not + # be applied to all cases. + assert len(ax.patches) == len(pd_ax.patches) + for i in range(len(ax.patches)): + assert ax.patches[i].xy == pd_ax.patches[i].xy + assert ax.patches[i]._height == pd_ax.patches[i]._height + + +def test_dataframes_hist_bins(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bins = 7 + columns = ["int64_col", "int64_too", "float64_col"] + ax = scalars_df[columns].plot.hist(bins=bins) + pd_ax = scalars_pandas_df[columns].plot.hist(bins=bins) + + # Compares axis values and height between bigframes and pandas histograms. + # Note: Due to potential float rounding by matplotlib, this test may not + # be applied to all cases. 
+ assert len(ax.patches) == len(pd_ax.patches) + for i in range(len(ax.patches)): + assert ax.patches[i]._height == pd_ax.patches[i]._height + + +@pytest.mark.parametrize( + ("col_names"), + [ + pytest.param(["int64_col"]), + pytest.param(["float64_col"]), + pytest.param(["int64_too", "bool_col"]), + pytest.param(["bool_col"], marks=pytest.mark.xfail(raises=TypeError)), + pytest.param(["date_col"], marks=pytest.mark.xfail(raises=TypeError)), + pytest.param(["datetime_col"], marks=pytest.mark.xfail(raises=TypeError)), + pytest.param(["time_col"], marks=pytest.mark.xfail(raises=TypeError)), + pytest.param(["timestamp_col"], marks=pytest.mark.xfail(raises=TypeError)), + ], +) +def test_hist_include_types(scalars_dfs, col_names): + scalars_df, _ = scalars_dfs + ax = scalars_df[col_names].plot.hist() + assert len(ax.patches) == 10 + + +@pytest.mark.parametrize( + ("arg_name", "arg_value"), + [ + pytest.param( + "by", ["int64_col"], marks=pytest.mark.xfail(raises=NotImplementedError) + ), + pytest.param( + "bins", [1, 3, 5], marks=pytest.mark.xfail(raises=NotImplementedError) + ), + pytest.param( + "weight", [2, 3], marks=pytest.mark.xfail(raises=NotImplementedError) + ), + pytest.param( + "backend", + "backend.module", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + ], +) +def test_hist_not_implemented_error(scalars_dfs, arg_name, arg_value): + scalars_df, _ = scalars_dfs + kwargs = {arg_name: arg_value} + scalars_df.plot.hist(**kwargs) + + +def test_hist_kwargs_true_subplots(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + axes = scalars_df[columns].plot.hist(subplots=True) + pd_axes = scalars_pandas_df[columns].plot.hist(subplots=True) + assert len(axes) == len(pd_axes) + + expected_labels = (["int64_col"], ["int64_too"], ["float64_col"]) + for ax, labels in zip(axes, expected_labels): + _check_legend_labels(ax, labels) + + +def test_hist_kwargs_list_subplots(scalars_dfs): + scalars_df, 
scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + subplots = [["int64_col", "int64_too"]] + axes = scalars_df[columns].plot.hist(subplots=subplots) + pd_axes = scalars_pandas_df[columns].plot.hist(subplots=subplots) + assert len(axes) == len(pd_axes) + + expected_labels = (["int64_col", "int64_too"], ["float64_col"]) + for ax, labels in zip(axes, expected_labels): + _check_legend_labels(ax, labels=labels) + + +@pytest.mark.parametrize( + ("orientation"), + [ + pytest.param("horizontal"), + pytest.param("vertical"), + ], +) +def test_hist_kwargs_orientation(scalars_dfs, orientation): + scalars_df, scalars_pandas_df = scalars_dfs + ax = scalars_df["int64_col"].plot.hist(orientation=orientation) + pd_ax = scalars_pandas_df["int64_col"].plot.hist(orientation=orientation) + assert ax.xaxis.get_label().get_text() == pd_ax.xaxis.get_label().get_text() + assert ax.yaxis.get_label().get_text() == pd_ax.yaxis.get_label().get_text() + + +def test_hist_kwargs_ticks_props(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + xticks = [20, 18] + yticks = [30, 40] + + ax = scalars_df["float64_col"].plot.hist(xticks=xticks, yticks=yticks) + pd_ax = scalars_pandas_df["float64_col"].plot.hist(xticks=xticks, yticks=yticks) + xlabels = ax.get_xticklabels() + pd_xlables = pd_ax.get_xticklabels() + assert len(xlabels) == len(pd_xlables) + for i in range(len(pd_xlables)): + tm.assert_almost_equal(xlabels[i].get_fontsize(), pd_xlables[i].get_fontsize()) + tm.assert_almost_equal(xlabels[i].get_rotation(), pd_xlables[i].get_rotation()) + + ylabels = ax.get_yticklabels() + pd_ylables = pd_ax.get_yticklabels() + assert len(xlabels) == len(pd_xlables) + for i in range(len(pd_xlables)): + tm.assert_almost_equal(ylabels[i].get_fontsize(), pd_ylables[i].get_fontsize()) + tm.assert_almost_equal(ylabels[i].get_rotation(), pd_ylables[i].get_rotation()) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py 
b/third_party/bigframes_vendored/pandas/core/frame.py index f88649ca13..0399d9c5b9 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -5224,3 +5224,14 @@ def dot(self, other): the matrix product of self and other in a DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def plot(self): + """ + Make plots of Dataframes. + + Returns: + bigframes.operations.plotting.PlotAccessor: + An accessor making plots. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 6c01a6dd0c..2c4f2aaa8f 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3111,6 +3111,17 @@ def str(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def plot(self): + """ + Make plots of Series. + + Returns: + bigframes.operations.plotting.PlotAccessor: + An accessor making plots. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def isin(self, values): """ Whether elements in Series are contained in values. diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py new file mode 100644 index 0000000000..d0425737ee --- /dev/null +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -0,0 +1,48 @@ +from typing import Optional, Sequence + +from bigframes import constants + + +class PlotAccessor: + """ + Make plots of Series or DataFrame with the `matplotlib` backend. + """ + + def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs): + """ + Draw one histogram of the DataFrame’s columns. + + A histogram is a representation of the distribution of data. 
+ This function groups the values of all given Series in the DataFrame + into bins and draws all bins in one :class:`matplotlib.axes.Axes`. + This is useful when the DataFrame's Series are in a similar scale. + + Parameters + ---------- + by : str or sequence, optional + Column in the DataFrame to group by. It is not supported yet. + bins : int, default 10 + Number of histogram bins to be used. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + :class:`matplotlib.axes.Axes` + Return a histogram plot. + + Examples + -------- + For DataFrame: + + .. plot:: + :context: close-figs + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) + >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) + >>> ax = df.plot.hist(bins=12, alpha=0.5) + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)