Skip to content

Commit

Permalink
feat: add Series.plot.hist method
Browse files Browse the repository at this point in the history
  • Loading branch information
chelsea-lin committed Feb 29, 2024
1 parent dd3643d commit 1cb6c5e
Show file tree
Hide file tree
Showing 7 changed files with 330 additions and 0 deletions.
108 changes: 108 additions & 0 deletions bigframes/operations/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Sequence

import matplotlib.pyplot as plt

import bigframes.constants as constants
import third_party.bigframes_vendored.pandas.plotting._core as vendordt


class PlotAccessor:
    # The class docstring is taken from the vendored PlotAccessor class.
    __doc__ = vendordt.PlotAccessor.__doc__

    def __init__(self, data) -> None:
        # Object whose values are plotted (`Series.plot` passes the series itself).
        self._parent = data

    def hist(self, by: Sequence[str] | None = None, bins: int = 10, **kwargs):
        """Draw a histogram of the parent series using matplotlib.

        Args:
            by: Grouping columns. Only ``None`` is accepted for now.
            bins: Number of histogram bins.
            **kwargs: ``ax``, ``figsize``, ``legend``, ``grid``, ``xticks``,
                ``yticks`` and ``figure`` are consumed here; anything else
                is forwarded to ``Axes.hist``. A non-None ``backend`` is
                rejected.

        Returns:
            The matplotlib axes the histogram was drawn on.

        Raises:
            NotImplementedError: If ``by`` or ``backend`` is not None, or if
                the parent object is a bigframes DataFrame.
        """
        if by is not None:
            raise NotImplementedError(
                f"Non-none `by` argument is not yet supported. {constants.FEEDBACK_LINK}"
            )
        if kwargs.pop("backend", None) is not None:
            raise NotImplementedError(
                f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}"
            )
        # Imported lazily, inside the method rather than at module level.
        import bigframes.dataframe as dataframe

        if isinstance(self._parent, dataframe.DataFrame):
            raise NotImplementedError(
                f"`Dataframe.plot.hist` is not implemented yet. {constants.FEEDBACK_LINK}"
            )

        return self._hist_series(
            by=by,
            bins=bins,
            **kwargs,
        )

    def _hist_series(
        self,
        by: Sequence[str] | None = None,
        bins: int = 10,
        **kwargs,
    ):
        """Compute the bin counts and render them with ``Axes.hist``.

        ``by`` is accepted for signature parity with ``hist`` but is not used.

        Raises:
            ValueError: If a non-None ``layout`` keyword is passed.
            AssertionError: If the supplied ``ax`` does not belong to the
                figure being drawn on.
        """
        # Only supported some arguments to adorn plots.
        ax = kwargs.pop("ax", None)
        figsize = kwargs.pop("figsize", None)
        legend = kwargs.pop("legend", False)
        grid = kwargs.pop("grid", None)
        xticks = kwargs.pop("xticks", None)
        yticks = kwargs.pop("yticks", None)

        # Calculates the bins' values and weights through BigQuery
        import bigframes.pandas as bpd

        series = self._parent.copy()
        binned = bpd.cut(series, bins=bins, labels=None)
        binned_data = (
            binned.struct.explode()
            .value_counts()
            .to_pandas()
            .sort_index(level="left_exclusive")
        )
        weights = binned_data.values
        left_bins = binned_data.index.get_level_values("left_exclusive")
        right_bins = binned_data.index.get_level_values("right_inclusive")
        bin_edges = left_bins.union(right_bins, sort=True)

        # This code takes the hist_series function from pandas and tweaks it a bit.
        if kwargs.get("layout", None) is not None:
            raise ValueError("The 'layout' keyword is not supported when 'by' is None")

        fig = kwargs.pop(
            "figure", plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize)
        )
        if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()):
            fig.set_size_inches(*figsize, forward=True)

        # `ax` was already taken out of kwargs above. Popping it a second time
        # here would always yield None and silently discard the caller's axes.
        if ax is None:
            ax = fig.gca()
        elif ax.get_figure() != fig:
            raise AssertionError("passed axis not bound to passed figure")

        if legend:
            kwargs["label"] = series.name
        # The counts are pre-aggregated: each bin's left edge is passed as a
        # sample value and its count as that sample's weight.
        ax.hist(x=left_bins, bins=bin_edges, weights=weights, **kwargs)
        if legend:
            ax.legend()
        if grid is not None:
            ax.grid(grid)
        if xticks is not None:
            ax.set_xticks(xticks)
        if yticks is not None:
            ax.set_yticks(yticks)

        return ax
102 changes: 102 additions & 0 deletions bigframes/plotting/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Iterable

from matplotlib.axes import Axes
import matplotlib.pyplot as plt
import numpy as np


def hist_series(
    series,
    by=None,
    ax=None,
    grid: bool = True,
    xlabelsize: int | None = None,
    xrot: float | None = None,
    ylabelsize: int | None = None,
    yrot: float | None = None,
    figsize=None,
    bins: int = 10,
    legend: bool = False,
    **kwargs,
):
    """Plot a histogram of ``series`` from pre-aggregated bin counts.

    The bin counts are obtained via ``bpd.cut`` and ``value_counts`` and
    then drawn with ``Axes.hist``. ``by`` is accepted but not used here.

    Returns:
        The axes the histogram was drawn on.

    Raises:
        ValueError: If a non-None ``layout`` keyword is passed.
        AssertionError: If ``ax`` does not belong to the target figure.
    """
    import bigframes.pandas as bpd

    # Calculates the bins' values and weights through BigQuery
    intervals = bpd.cut(series, bins=bins, labels=None)
    exploded = intervals.struct.explode()
    counts = exploded.value_counts().to_pandas()
    counts = counts.sort_index(level="left_exclusive")

    heights = counts.values
    lower_edges = counts.index.get_level_values("left_exclusive")
    upper_edges = counts.index.get_level_values("right_inclusive")
    edges = lower_edges.union(upper_edges, sort=True)

    # This code takes the hist_series function from pandas and tweaks it a bit.
    if kwargs.get("layout", None) is not None:
        raise ValueError("The 'layout' keyword is not supported when 'by' is None")

    # hack until the plotting interface is a bit more unified
    fallback_fig = plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize)
    fig = kwargs.pop("figure", fallback_fig)
    if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()):
        fig.set_size_inches(*figsize, forward=True)

    if ax is not None:
        if ax.get_figure() != fig:
            raise AssertionError("passed axis not bound to passed figure")
    else:
        ax = fig.gca()

    if legend:
        kwargs["label"] = series.name
    ax.hist(x=lower_edges, bins=edges, weights=heights, **kwargs)
    if legend:
        ax.legend()
    ax.grid(grid)

    axes = np.array([ax])
    _set_ticks_props(
        axes,
        xlabelsize=xlabelsize,
        xrot=xrot,
        ylabelsize=ylabelsize,
        yrot=yrot,
    )

    # Unwrap a one-element, one-dimensional container to the bare axes.
    is_single = hasattr(axes, "ndim") and axes.ndim == 1 and len(axes) == 1
    return axes[0] if is_single else axes


def _set_ticks_props(
    axes: Iterable[Axes],
    xlabelsize: int | None = None,
    xrot=None,
    ylabelsize: int | None = None,
    yrot=None,
):
    """Apply font size and rotation to the tick labels of each axes.

    Properties whose value is None are left untouched. Returns ``axes``
    unchanged so the call can be chained.
    """
    # (tick-label getter, text property, requested value), in application order.
    requested = (
        ("get_xticklabels", "fontsize", xlabelsize),
        ("get_xticklabels", "rotation", xrot),
        ("get_yticklabels", "fontsize", ylabelsize),
        ("get_yticklabels", "rotation", yrot),
    )
    active = [spec for spec in requested if spec[2] is not None]

    for axis in axes:
        for getter_name, prop, value in active:
            plt.setp(getattr(axis, getter_name)(), **{prop: value})
    return axes
5 changes: 5 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.base
import bigframes.operations.datetimes as dt
import bigframes.operations.plot as plot
import bigframes.operations.strings as strings
import bigframes.operations.structs as structs
import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series
Expand Down Expand Up @@ -1551,6 +1552,10 @@ def __array_ufunc__(
def str(self) -> strings.StringMethods:
return strings.StringMethods(self._block)

@property
def plot(self):
    """Return a plotting accessor bound to this series."""
    accessor = plot.PlotAccessor(self)
    return accessor

def _slice(
self,
start: typing.Optional[int] = None,
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
"matplotlib >= 3.7.1",
]
extras = {
# Optional test dependencies packages. If they're missed, may skip some tests.
Expand Down
50 changes: 50 additions & 0 deletions tests/system/small/operations/test_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas._testing as tm


def test_series_hist_bins(scalars_dfs):
    """Compare the bars of the bigframes histogram with the pandas one."""
    bf_df, pd_df = scalars_dfs
    ax = bf_df["int64_col"].plot.hist(bins=5)
    pd_ax = pd_df["int64_col"].hist(bins=5)

    # NOTE(review): both calls appear to draw on pyplot's current figure, so
    # `ax` and `pd_ax` may be the same axes object, which would make the
    # checks below vacuous -- confirm and consider using separate figures.

    # Check hist has same height compared to the pandas one.
    bars = ax.patches
    pd_bars = pd_ax.patches
    assert len(bars) == len(pd_bars)
    for bar, pd_bar in zip(bars, pd_bars):
        assert bar.xy == pd_bar.xy
        assert bar._height == pd_bar._height


def test_series_hist_ticks_props(scalars_dfs):
    """Check that tick labels match pandas when xticks/yticks are passed."""
    scalars_df, scalars_pandas_df = scalars_dfs

    xticks = [20, 18]
    yticks = [30, 40]

    ax = scalars_df["float64_col"].plot.hist(xticks=xticks, yticks=yticks)
    pd_ax = scalars_pandas_df["float64_col"].plot.hist(xticks=xticks, yticks=yticks)

    xlabels = ax.get_xticklabels()
    pd_xlabels = pd_ax.get_xticklabels()
    assert len(xlabels) == len(pd_xlabels)
    for label, pd_label in zip(xlabels, pd_xlabels):
        tm.assert_almost_equal(label.get_fontsize(), pd_label.get_fontsize())
        tm.assert_almost_equal(label.get_rotation(), pd_label.get_rotation())

    # Compare the y labels against the y labels. Reusing the x-label
    # lengths here would leave the y-label counts unchecked.
    ylabels = ax.get_yticklabels()
    pd_ylabels = pd_ax.get_yticklabels()
    assert len(ylabels) == len(pd_ylabels)
    for label, pd_label in zip(ylabels, pd_ylabels):
        tm.assert_almost_equal(label.get_fontsize(), pd_label.get_fontsize())
        tm.assert_almost_equal(label.get_rotation(), pd_label.get_rotation())
11 changes: 11 additions & 0 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3111,6 +3111,17 @@ def str(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def plot(self):
    """
    Make plots of Series.

    Returns:
        bigframes.operations.plot.PlotAccessor:
            An accessor making plots.
    """
    # Abstract placeholder: this vendored definition only raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def isin(self, values):
"""
Whether elements in Series are contained in values.
Expand Down
53 changes: 53 additions & 0 deletions third_party/bigframes_vendored/pandas/plotting/_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import Sequence

from bigframes import constants


class PlotAccessor:
    def hist(self, by: Sequence[str] | None = None, bins: int = 10, **kwargs):
        """
        Draw histogram of the input series using matplotlib.

        Parameters
        ----------
        by : str or sequence, optional
            If passed, then used to form histograms for separate groups.
            Currently, it is not supported yet.
        bins : int, default 10
            Number of histogram bins to be used.
        ax : matplotlib axes object, default None
            An axes of the current figure.
        grid : bool, default None (matlab style default)
            Axis grid lines.
        xticks : sequence
            Values to use for the xticks.
        yticks : sequence
            Values to use for the yticks.
        figsize : a tuple (width, height) in inches
            Size of a figure object.
        backend : str, default None
            Backend to use instead of the backend specified in the option
            ``plotting.backend``. Currently, only `matplotlib` is supported.
        legend : bool, default False
            Place legend on axis subplots.
        **kwargs
            Options to pass to matplotlib plotting method.

        Returns
        -------
        :class:`matplotlib.Axes`
            A histogram plot.

        Examples
        --------
        For Series:

        .. plot::
            :context: close-figs

            >>> import bigframes.pandas as bpd
            >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
            >>> ser = bpd.Series([1, 2, 2, 4, 6, 6], index=lst)
            >>> hist = ser.plot.hist()
        """
        # Abstract placeholder: this vendored definition only raises.
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 comments on commit 1cb6c5e

Please sign in to comment.