Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add Series.plot.hist() method #398

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions bigframes/operations/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Sequence

import matplotlib.pyplot as plt

import bigframes.constants as constants
import third_party.bigframes_vendored.pandas.plotting._core as vendordt


class PlotAccessor:
    __doc__ = vendordt.PlotAccessor.__doc__

    def __init__(self, data) -> None:
        # Object the accessor was created from; `hist` rejects DataFrame
        # parents and otherwise treats it as a Series.
        self._parent = data

    def hist(self, by: Sequence[str] | None = None, bins: int = 10, **kwargs):
        """Draw a histogram of the parent Series with matplotlib.

        Args:
            by: Grouping columns. Only ``None`` is accepted for now.
            bins: Number of bins, forwarded to ``bigframes.pandas.cut``.
            **kwargs: ``backend`` must be absent or ``None``. ``ax``,
                ``figure``, ``figsize``, ``legend``, ``grid``, ``xticks``,
                ``yticks`` and ``layout`` are consumed by ``_hist_series``;
                anything else is forwarded to ``Axes.hist``.

        Returns:
            The matplotlib axes the histogram was drawn on.

        Raises:
            NotImplementedError: If ``by`` is not ``None``, a ``backend`` is
                requested, or the parent object is a DataFrame.
        """
        if by is not None:
            raise NotImplementedError(
                f"Non-none `by` argument is not yet supported. {constants.FEEDBACK_LINK}"
            )
        if kwargs.pop("backend", None) is not None:
            raise NotImplementedError(
                f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}"
            )
        # NOTE(review): imported locally, presumably to avoid a circular
        # import with bigframes.dataframe -- confirm.
        import bigframes.dataframe as dataframe

        if isinstance(self._parent, dataframe.DataFrame):
            raise NotImplementedError(
                f"`Dataframe.plot.hist` is not implemented yet. {constants.FEEDBACK_LINK}"
            )

        return self._hist_series(
            by=by,
            bins=bins,
            **kwargs,
        )

    def _hist_series(
        self,
        by: Sequence[str] | None = None,
        bins: int = 10,
        **kwargs,
    ):
        """Bin the parent Series through BigQuery and draw the histogram.

        Args:
            by: Unused here; ``hist`` only forwards ``None``.
            bins: Number of bins passed to ``bigframes.pandas.cut``.
            **kwargs: Plot adornments (``ax``, ``figure``, ``figsize``,
                ``legend``, ``grid``, ``xticks``, ``yticks``) are consumed
                here; the remainder is forwarded to ``Axes.hist``.

        Returns:
            The matplotlib axes the histogram was drawn on.

        Raises:
            ValueError: If a non-``None`` ``layout`` is passed.
            AssertionError: If the passed ``ax`` does not belong to the
                figure being drawn on.
        """
        # Only a subset of adornment arguments is supported. Each one is
        # popped exactly once so that it is honoured here and is not
        # forwarded to ``Axes.hist`` below.
        ax = kwargs.pop("ax", None)
        figsize = kwargs.pop("figsize", None)
        legend = kwargs.pop("legend", False)
        grid = kwargs.pop("grid", None)
        xticks = kwargs.pop("xticks", None)
        yticks = kwargs.pop("yticks", None)

        # Validate before running the query so that bad arguments fail fast.
        # Popping also keeps an explicit ``layout=None`` away from ``Axes.hist``.
        if kwargs.pop("layout", None) is not None:
            raise ValueError("The 'layout' keyword is not supported when 'by' is None")

        # Calculates the bins' values and weights through BigQuery
        import bigframes.pandas as bpd

        series = self._parent.copy()
        binned = bpd.cut(series, bins=bins, labels=None)
        # Count the rows per interval, download the counts and order them by
        # the interval's left edge.
        binned_data = (
            binned.struct.explode()
            .value_counts()
            .to_pandas()
            .sort_index(level="left_exclusive")
        )
        weights = binned_data.values
        left_bins = binned_data.index.get_level_values("left_exclusive")
        right_bins = binned_data.index.get_level_values("right_inclusive")
        bin_edges = left_bins.union(right_bins, sort=True)

        # This code takes the hist_series function from pandas and tweaks it a bit.
        # The fallback figure is resolved only when the caller did not pass
        # one, so no extra pyplot figure is created as a side effect.
        fig = kwargs.pop("figure", None)
        if fig is None:
            fig = plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize)
        if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()):
            fig.set_size_inches(*figsize, forward=True)

        if ax is None:
            ax = fig.gca()
        elif ax.get_figure() != fig:
            raise AssertionError("passed axis not bound to passed figure")

        if legend:
            kwargs["label"] = series.name
        # The data is already aggregated: each left edge stands in for its
        # bin and is weighted by that bin's row count.
        ax.hist(x=left_bins, bins=bin_edges, weights=weights, **kwargs)
        if legend:
            ax.legend()
        if grid is not None:
            ax.grid(grid)
        if xticks is not None:
            ax.set_xticks(xticks)
        if yticks is not None:
            ax.set_yticks(yticks)

        return ax
5 changes: 5 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.base
import bigframes.operations.datetimes as dt
import bigframes.operations.plot as plot
import bigframes.operations.strings as strings
import bigframes.operations.structs as structs
import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series
Expand Down Expand Up @@ -1551,6 +1552,10 @@ def __array_ufunc__(
def str(self) -> strings.StringMethods:
return strings.StringMethods(self._block)

@property
def plot(self):
    # Wrap this Series in a fresh plotting accessor on every access.
    accessor = plot.PlotAccessor(self)
    return accessor

def _slice(
self,
start: typing.Optional[int] = None,
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
"matplotlib >= 3.7.1",
]
extras = {
# Optional test dependencies packages. If they're missed, may skip some tests.
Expand Down
50 changes: 50 additions & 0 deletions tests/system/small/operations/test_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas._testing as tm


def test_series_hist_bins(scalars_dfs):
    """Histogram bars for `int64_col` should match pandas bar for bar."""
    bf_df, pd_df = scalars_dfs
    ax = bf_df["int64_col"].plot.hist(bins=5)
    pd_ax = pd_df["int64_col"].hist(bins=5)
    # NOTE(review): neither call passes `ax`; if both end up drawing on the
    # same current axes, the checks below compare an axes with itself --
    # confirm the two plots land on distinct axes.

    # Same number of bars, each with the same anchor point and height.
    bf_patches = ax.patches
    pd_patches = pd_ax.patches
    assert len(bf_patches) == len(pd_patches)
    for bf_patch, pd_patch in zip(bf_patches, pd_patches):
        assert bf_patch.xy == pd_patch.xy
        assert bf_patch._height == pd_patch._height


def test_series_hist_ticks_props(scalars_dfs):
    """Tick labels set via `xticks`/`yticks` should match the pandas plot.

    For both axes, the number of tick labels and each label's font size and
    rotation are compared against the pandas result.
    """
    scalars_df, scalars_pandas_df = scalars_dfs

    xticks = [20, 18]
    yticks = [30, 40]

    ax = scalars_df["float64_col"].plot.hist(xticks=xticks, yticks=yticks)
    pd_ax = scalars_pandas_df["float64_col"].plot.hist(xticks=xticks, yticks=yticks)

    xlabels = ax.get_xticklabels()
    pd_xlabels = pd_ax.get_xticklabels()
    assert len(xlabels) == len(pd_xlabels)
    for label, pd_label in zip(xlabels, pd_xlabels):
        tm.assert_almost_equal(label.get_fontsize(), pd_label.get_fontsize())
        tm.assert_almost_equal(label.get_rotation(), pd_label.get_rotation())

    # The y axis is checked against its own label lists, so a mismatch in
    # the number of y tick labels is detected too.
    ylabels = ax.get_yticklabels()
    pd_ylabels = pd_ax.get_yticklabels()
    assert len(ylabels) == len(pd_ylabels)
    for label, pd_label in zip(ylabels, pd_ylabels):
        tm.assert_almost_equal(label.get_fontsize(), pd_label.get_fontsize())
        tm.assert_almost_equal(label.get_rotation(), pd_label.get_rotation())
11 changes: 11 additions & 0 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3111,6 +3111,17 @@ def str(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def plot(self):
    """
    Accessor for drawing plots of this Series.

    Returns:
        bigframes.operations.plot.PlotAccessor:
            An accessor object exposing the plotting methods.
    """
    message = constants.ABSTRACT_METHOD_ERROR_MESSAGE
    raise NotImplementedError(message)

def isin(self, values):
"""
Whether elements in Series are contained in values.
Expand Down
53 changes: 53 additions & 0 deletions third_party/bigframes_vendored/pandas/plotting/_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import Sequence

from bigframes import constants


class PlotAccessor:
def hist(self, by: Sequence[str] | None = None, bins: int = 10, **kwargs):
"""
Draw histogram of the input series using matplotlib.

Parameters
----------
by : str or sequence, optional
If passed, then used to form histograms for separate groups.
Currently, it is not supported yet.
bins : int, default 10
Number of histogram bins to be used.
ax : matplotlib axes object, default None
An axes of the current figure.
grid : bool, default None (matlab style default)
Axis grid lines.
xticks : sequence
Values to use for the xticks.
yticks : sequence
Values to use for the yticks.
figsize : a tuple (width, height) in inches
Size of a figure object.
backend : str, default None
Backend to use instead of the backend specified in the option
``plotting.backend``. Currently, only `matplotlib` is not supported yet.
legend : bool, default False
Place legend on axis subplots.
**kwargs
Options to pass to matplotlib plotting method.

Returns
-------
class:`matplotlib.Axes`
A histogram plot.

Examples
--------
For Series:

.. plot::
:context: close-figs

>>> import bigframes.pandas as bpd
>>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
>>> ser = bpd.Series([1, 2, 2, 4, 6, 6], index=lst)
>>> hist = ser.plot.hist()
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
Loading