From e7916d943aafe15c7cad9a11a003bcdbeedbb35f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 14 Nov 2024 19:44:14 +0000 Subject: [PATCH 1/4] feat: (Series | DataFrame).plot.bar --- bigframes/operations/_matplotlib/__init__.py | 5 +- bigframes/operations/_matplotlib/core.py | 21 ++++--- bigframes/operations/plotting.py | 10 +++- .../system/small/operations/test_plotting.py | 12 ++++ .../pandas/plotting/_core.py | 60 +++++++++++++++++++ 5 files changed, 98 insertions(+), 10 deletions(-) diff --git a/bigframes/operations/_matplotlib/__init__.py b/bigframes/operations/_matplotlib/__init__.py index 6ffe71139d..5f99d3b50a 100644 --- a/bigframes/operations/_matplotlib/__init__.py +++ b/bigframes/operations/_matplotlib/__init__.py @@ -20,10 +20,11 @@ PLOT_TYPES = typing.Union[type[core.SamplingPlot], type[hist.HistPlot]] PLOT_CLASSES: dict[str, PLOT_TYPES] = { - "hist": hist.HistPlot, - "line": core.LinePlot, "area": core.AreaPlot, + "bar": core.BarPlot, + "line": core.LinePlot, "scatter": core.ScatterPlot, + "hist": hist.HistPlot, } diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 9e59e09877..9292814fc0 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -46,7 +46,8 @@ def result(self): class SamplingPlot(MPLPlot): - @abc.abstractproperty + @property + @abc.abstractmethod def _kind(self): pass @@ -74,18 +75,24 @@ def _compute_plot_data(self): return self._compute_sample_data(self.data) -class LinePlot(SamplingPlot): - @property - def _kind(self) -> typing.Literal["line"]: - return "line" - - class AreaPlot(SamplingPlot): @property def _kind(self) -> typing.Literal["area"]: return "area" +class BarPlot(SamplingPlot): + @property + def _kind(self) -> typing.Literal["bar"]: + return "bar" + + +class LinePlot(SamplingPlot): + @property + def _kind(self) -> typing.Literal["line"]: + return "line" + + class ScatterPlot(SamplingPlot): @property def _kind(self) -> 
typing.Literal["scatter"]: diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py index a45b825354..e9a86be6c9 100644 --- a/bigframes/operations/plotting.py +++ b/bigframes/operations/plotting.py @@ -23,7 +23,7 @@ class PlotAccessor(vendordt.PlotAccessor): __doc__ = vendordt.PlotAccessor.__doc__ - _common_kinds = ("line", "area", "hist") + _common_kinds = ("line", "area", "hist", "bar") _dataframe_kinds = ("scatter",) _all_kinds = _common_kinds + _dataframe_kinds @@ -72,6 +72,14 @@ def area( ): return self(kind="area", x=x, y=y, stacked=stacked, **kwargs) + def bar( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + **kwargs, + ): + return self(kind="bar", x=x, y=y, **kwargs) + def scatter( self, x: typing.Optional[typing.Hashable] = None, diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index 7be44e0a0f..3624232ea0 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -195,6 +195,18 @@ def test_area(scalars_dfs): tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1]) +def test_bar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["int64_col", "float64_col", "int64_too"] + ax = scalars_df[col_names].plot.bar() + pd_ax = scalars_pandas_df[col_names].plot.bar() + tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) + tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) + for line, pd_line in zip(ax.lines, pd_ax.lines): + # Compare y coordinates between the lines + tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1]) + + def test_scatter(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_names = ["int64_col", "float64_col", "int64_too", "bool_col"] diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index 
2409068fa8..4ed5c8eb0b 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -215,6 +215,66 @@ def area( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def bar( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + **kwargs, + ): + """ + Draw a vertical bar plot. + + This function calls `pandas.plot` to generate a plot with a random sample + of items. For consistent results, the random sampling is reproducible. + Use the `sampling_random_state` parameter to modify the sampling seed. + + **Examples:** + + Basic plot. + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) + >>> ax = df.plot.bar(x='lab', y='val', rot=0) + + Plot a whole dataframe to a bar plot. Each column is assigned a distinct color, + and each row is nested in a group along the horizontal axis. + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = bpd.DataFrame({'speed': speed, 'lifespan': lifespan}, index=index) + >>> ax = df.plot.bar(rot=0) + + Plot stacked bar charts for the DataFrame. + + >>> ax = df.plot.bar(stacked=True) + + If you don’t like the default colours, you can specify how you’d like each column + to be colored. + + >>> axes = df.plot.bar( + ... rot=0, subplots=True, color={"speed": "red", "lifespan": "green"} + ... ) + + Args: + x (label or position, optional): + Allows plotting of one column versus another. If not specified, the index + of the DataFrame is used. + y (label or position, optional): + Allows plotting of one column versus another. If not specified, all numerical + columns are used. + **kwargs: + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. 
+ + Returns: + matplotlib.axes.Axes or numpy.ndarray: + Bar plot, or array of bar plots if subplots is True. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def scatter( self, x: typing.Optional[typing.Hashable] = None, From 86de62c42f1b181441156cb492ce13ab476e5673 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 15 Nov 2024 22:19:56 +0000 Subject: [PATCH 2/4] add warning message --- bigframes/operations/_matplotlib/core.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 9292814fc0..22c726d6f0 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -14,6 +14,7 @@ import abc import typing +import warnings import bigframes_vendored.constants as constants import pandas as pd @@ -62,6 +63,15 @@ def generate(self) -> None: def _compute_sample_data(self, data): # TODO: Cache the sampling data in the PlotAccessor. sampling_n = self.kwargs.pop("sampling_n", DEFAULT_SAMPLING_N) + if self._sampling_warning_msg is not None: + total_n = data.shape[0] + if sampling_n < total_n: + warnings.warn( + self._sampling_warning_msg().format( + sampling_n=sampling_n, total_n=total_n + ) + ) + sampling_random_state = self.kwargs.pop( "sampling_random_state", DEFAULT_SAMPLING_STATE ) @@ -74,6 +84,9 @@ def _compute_sample_data(self, data): def _compute_plot_data(self): return self._compute_sample_data(self.data) + def _sampling_warning_msg(self) -> str: + return None + class AreaPlot(SamplingPlot): @property @@ -86,6 +99,14 @@ class BarPlot(SamplingPlot): def _kind(self) -> typing.Literal["bar"]: return "bar" + def _sampling_warning_msg(self) -> str: + return ( + "To optimize plotting performance, your data has been downsampled to {sampling_n} " + "rows from the original {total_n} rows. This may result in some data points " + "not being displayed. 
For a more comprehensive view, consider pre-processing " + "your data by aggregating it or selecting the top categories." + ) + class LinePlot(SamplingPlot): @property From ae9e7540469297eb2b85d1aba5de104a963d3d76 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 18 Nov 2024 18:51:04 +0000 Subject: [PATCH 3/4] fix mypy --- bigframes/operations/_matplotlib/core.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 22c726d6f0..2d1d9f3f08 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -52,6 +52,10 @@ class SamplingPlot(MPLPlot): def _kind(self): pass + @property + def _sampling_warning_msg(self) -> typing.Optional[str]: + return None + def __init__(self, data, **kwargs) -> None: self.kwargs = kwargs self.data = data @@ -67,7 +71,7 @@ def _compute_sample_data(self, data): total_n = data.shape[0] if sampling_n < total_n: warnings.warn( - self._sampling_warning_msg().format( + self._sampling_warning_msg.format( sampling_n=sampling_n, total_n=total_n ) ) @@ -84,10 +88,6 @@ def _compute_sample_data(self, data): def _compute_plot_data(self): return self._compute_sample_data(self.data) - def _sampling_warning_msg(self) -> str: - return None - - class AreaPlot(SamplingPlot): @property def _kind(self) -> typing.Literal["area"]: @@ -99,7 +99,8 @@ class BarPlot(SamplingPlot): def _kind(self) -> typing.Literal["bar"]: return "bar" - def _sampling_warning_msg(self) -> str: + @property + def _sampling_warning_msg(self) -> typing.Optional[str]: return ( "To optimize plotting performance, your data has been downsampled to {sampling_n} " "rows from the original {total_n} rows. 
This may result in some data points " From 194089d53ec90017ba3108b22b521e05b7a45ad6 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 18 Nov 2024 18:53:54 +0000 Subject: [PATCH 4/4] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/operations/_matplotlib/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index 2d1d9f3f08..b7c926be99 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -88,6 +88,7 @@ def _compute_sample_data(self, data): def _compute_plot_data(self): return self._compute_sample_data(self.data) + class AreaPlot(SamplingPlot): @property def _kind(self) -> typing.Literal["area"]: