Skip to content

Commit

Permalink
feat: (Series|Dataframe).plot.hist() (#420)
Browse files Browse the repository at this point in the history
* feat: (Series|Dataframe).plot.hist()
  • Loading branch information
chelsea-lin authored Mar 11, 2024
1 parent 60594f4 commit 4aadff4
Show file tree
Hide file tree
Showing 14 changed files with 536 additions and 0 deletions.
5 changes: 5 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
import bigframes.formatting_helpers as formatter
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.plotting as plotting
import bigframes.series
import bigframes.series as bf_series
import bigframes.session._io.bigquery
Expand Down Expand Up @@ -3193,4 +3194,8 @@ def get_right_id(id):

return result

@property
def plot(self):
    """Return a ``plotting.PlotAccessor`` wrapping this object."""
    accessor = plotting.PlotAccessor(self)
    return accessor

__matmul__ = dot
30 changes: 30 additions & 0 deletions bigframes/operations/_matplotlib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bigframes.operations._matplotlib.core as core
import bigframes.operations._matplotlib.hist as hist

# Registry mapping a plot ``kind`` string to the MPLPlot subclass that renders
# it. ``plot()`` below looks the class up here by ``kind``.
PLOT_CLASSES: dict[str, type[core.MPLPlot]] = {
    "hist": hist.HistPlot,
}


def plot(data, kind, **kwargs):
    """Build, render and return the plot of the requested ``kind``.

    Args:
        data: Object handed to the plot class constructor.
        kind: Key into ``PLOT_CLASSES`` selecting the plot class.
        **kwargs: Forwarded unchanged to the plot class constructor.

    Returns:
        The ``result`` attribute of the constructed plot object.

    Raises:
        KeyError: If ``kind`` is not a key of ``PLOT_CLASSES``.
    """
    plot_cls = PLOT_CLASSES[kind]
    plotter = plot_cls(data, **kwargs)
    # Order matters: generate() produces the axes that draw() refreshes and
    # that ``result`` exposes.
    plotter.generate()
    plotter.draw()
    return plotter.result


# Only ``plot`` is exported from this package.
__all__ = ["plot"]
30 changes: 30 additions & 0 deletions bigframes/operations/_matplotlib/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc

import matplotlib.pyplot as plt


class MPLPlot(abc.ABC):
    """Abstract base class for matplotlib-backed plots.

    Concrete subclasses implement :meth:`generate`. This base class never
    assigns ``self.axes``; :attr:`result` simply returns it, so ``generate``
    is expected to set that attribute before ``result`` is read.
    """

    @abc.abstractmethod
    def generate(self):
        """Build the plot; must be overridden by every concrete subclass."""

    def draw(self) -> None:
        """Delegate to ``matplotlib.pyplot.draw_if_interactive``."""
        plt.draw_if_interactive()

    @property
    def result(self):
        """The object stored on ``self.axes``."""
        return self.axes
172 changes: 172 additions & 0 deletions bigframes/operations/_matplotlib/hist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
from typing import Literal

import numpy as np
import pandas as pd

import bigframes.constants as constants
import bigframes.operations._matplotlib.core as bfplt


class HistPlot(bfplt.MPLPlot):
    """Histogram plot whose bar heights are computed with BigFrames.

    Each numeric column is bucketed with ``bigframes.pandas.cut`` and counted
    with ``value_counts``. Only those per-bin counts are downloaded
    (``to_pandas``) and then drawn with the pandas ``plot.hist`` API, using
    the counts as weights.
    """

    @property
    def _kind(self) -> Literal["hist"]:
        return "hist"

    def __init__(
        self,
        data,
        bins: int = 10,
        **kwargs,
    ) -> None:
        """Validate the arguments and keep the numeric columns of ``data``.

        Args:
            data: A BigFrames Series or DataFrame.
            bins: Number of histogram bins; only ``int`` is supported.
            **kwargs: Extra plotting options. ``by`` is removed and must be
                None; ``weight`` and ``weights`` must be None or absent;
                ``label`` names the column when ``data`` is a Series; the
                rest is forwarded to the pandas ``plot.hist`` call.

        Raises:
            NotImplementedError: For a non-None ``by``, ``weight`` or
                ``weights``, or a non-integer ``bins``.
            TypeError: If ``data`` has no numeric columns.
        """
        self.bins = bins
        self.label = kwargs.get("label", None)
        self.by = kwargs.pop("by", None)
        # `generate` passes its own `weights=` to pandas. A user-supplied
        # `weights` left in kwargs (even None) would make that call fail with
        # "got multiple values for keyword argument 'weights'", so take it
        # out here and reject it explicitly below.
        user_weights = kwargs.pop("weights", None)
        self.kwargs = kwargs

        if self.by is not None:
            raise NotImplementedError(
                f"Non-none `by` argument is not yet supported. {constants.FEEDBACK_LINK}"
            )
        if not isinstance(self.bins, int):
            raise NotImplementedError(
                f"Only integer values are supported for the `bins` argument. {constants.FEEDBACK_LINK}"
            )
        if kwargs.get("weight", None) is not None:
            raise NotImplementedError(
                f"Non-none `weight` argument is not yet supported. {constants.FEEDBACK_LINK}"
            )
        if user_weights is not None:
            raise NotImplementedError(
                f"Non-none `weights` argument is not yet supported. {constants.FEEDBACK_LINK}"
            )

        self.data = self._compute_plot_data(data)

    def generate(self) -> None:
        """
        Calculates weighted histograms through BigQuery and plots them through pandas
        native histogram plot.

        The resulting axes object is stored on ``self.axes``.
        """
        hist_bars = self._calculate_hist_bars(self.data, self.bins)
        bin_edges = self._calculate_bin_edges(
            hist_bars, self.bins, self.kwargs.get("range", None)
        )

        # Bar heights per column; passed to pandas as the sample weights.
        weights = {
            col_name: hist_bar.values for col_name, hist_bar in hist_bars.items()
        }
        # One x value per bar: the midpoint of the bar's interval.
        hist_x = {
            col_name: pd.Series(
                (
                    hist_bar.index.get_level_values("left_exclusive")
                    + hist_bar.index.get_level_values("right_inclusive")
                )
                / 2.0
            )
            for col_name, hist_bar in hist_bars.items()
        }

        column_order = self.data.columns.values

        # Align DataFrames for plotting despite potential differences in column
        # lengths; zip_longest pads shorter columns with None, which is
        # replaced by zeros below.
        hist_x_pd = pd.DataFrame(
            list(itertools.zip_longest(*hist_x.values())), columns=list(hist_x.keys())
        ).sort_index(axis=1)[column_order]
        weights_pd = pd.DataFrame(
            list(itertools.zip_longest(*weights.values())), columns=list(weights.keys())
        ).sort_index(axis=1)[column_order]

        # Prevents pandas from dropping NA values and causing length mismatches by
        # filling them with zeros. Padded rows get weight 0, so they add
        # nothing to any bar.
        hist_x_pd = hist_x_pd.fillna(0)
        weights_pd = weights_pd.fillna(0)

        self.axes = hist_x_pd.plot.hist(
            bins=bin_edges,
            weights=np.array(weights_pd.values),
            **self.kwargs,
        )  # type: ignore

    def _compute_plot_data(self, data):
        """
        Prepares data for plotting, focusing on numeric data types.

        A Series is first converted to a single-column frame, named after
        ``self.label`` when given, or "" when the Series has neither a label
        nor a name.

        Returns:
            The subset of ``data`` selected by ``select_dtypes("number")``.

        Raises:
            TypeError: If the input data contains no numeric columns.
        """
        # Importing at the top of the file causes a circular import.
        import bigframes.series as series

        if isinstance(data, series.Series):
            label = self.label
            if label is None and data.name is None:
                label = ""
            if label is None:
                data = data.to_frame()
            else:
                data = data.to_frame(name=label)

        # TODO(chelsealin): Support timestamp/date types here.
        include_type = ["number"]
        numeric_data = data.select_dtypes(include=include_type)
        try:
            is_empty = numeric_data.columns.empty
        except AttributeError:
            is_empty = not len(numeric_data)

        if is_empty:
            raise TypeError("no numeric data to plot")

        return numeric_data

    @staticmethod
    def _calculate_hist_bars(data, bins):
        """
        Calculates histogram bars for each column in a BigFrames DataFrame, and
        returns a dictionary where keys are column names and values are pandas
        Series. The series values are the histogram bins' heights with a
        multi-index defining 'left_exclusive' and 'right_inclusive' bin edges.
        """
        import bigframes.pandas as bpd

        # TODO: Optimize this by batching multiple jobs into one.
        hist_bar = {}
        for col in data.columns:
            cutted_data = bpd.cut(data[col], bins=bins, labels=None)
            hist_bar[col] = (
                cutted_data.struct.explode()
                .value_counts()
                .to_pandas()
                .sort_index(level="left_exclusive")
            )
        return hist_bar

    @staticmethod
    def _calculate_bin_edges(hist_bars, bins, range):
        """
        Calculate bin edges from the histogram bars.

        Args:
            hist_bars: Mapping of column name to a pandas Series whose index
                has 'left_exclusive' and 'right_inclusive' levels.
            bins: Number of bins, forwarded to ``numpy.histogram``.
            range: Optional (lower, upper) pair, forwarded to
                ``numpy.histogram``.

        Returns:
            The bin-edge array from ``numpy.histogram`` computed over the
            union of all bar edges, or None when ``hist_bars`` is empty.
        """
        bin_edges = None
        for hist_bar in hist_bars.values():
            left = hist_bar.index.get_level_values("left_exclusive")
            right = hist_bar.index.get_level_values("right_inclusive")
            if bin_edges is None:
                bin_edges = left.union(right)
            else:
                bin_edges = left.union(right).union(bin_edges)

        if bin_edges is None:
            return None

        # Only the edges (second return value) are needed; the counts of this
        # helper histogram are discarded.
        _, bins = np.histogram(bin_edges, bins=bins, range=range)
        return bins
34 changes: 34 additions & 0 deletions bigframes/operations/plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Sequence

import bigframes.constants as constants
import bigframes.operations._matplotlib as bfplt
import third_party.bigframes_vendored.pandas.plotting._core as vendordt


class PlotAccessor:
    __doc__ = vendordt.PlotAccessor.__doc__

    def __init__(self, data) -> None:
        # The object being plotted; ``hist`` plots a copy of it.
        self._parent = data

    def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs):
        """Draw a histogram of the parent object with the matplotlib backend.

        Args:
            by: Forwarded to the matplotlib plotting backend.
            bins: Number of histogram bins, forwarded to the backend.
            **kwargs: Extra options forwarded to the backend. ``backend`` is
                consumed here and may only be None or "matplotlib".

        Returns:
            Whatever the backend ``plot`` function returns.

        Raises:
            NotImplementedError: If ``backend`` names anything other than
                matplotlib.
        """
        # matplotlib is the one supported backend, so naming it explicitly is
        # accepted as well as leaving it unset.
        backend = kwargs.pop("backend", None)
        if backend not in (None, "matplotlib"):
            raise NotImplementedError(
                f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}"
            )
        # Calls matplotlib backend to plot the data.
        return bfplt.plot(self._parent.copy(), kind="hist", by=by, bins=bins, **kwargs)
5 changes: 5 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.base
import bigframes.operations.datetimes as dt
import bigframes.operations.plotting as plotting
import bigframes.operations.strings as strings
import bigframes.operations.structs as structs
import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series
Expand Down Expand Up @@ -1557,6 +1558,10 @@ def __array_ufunc__(
def str(self) -> strings.StringMethods:
return strings.StringMethods(self._block)

@property
def plot(self):
    """Return a ``plotting.PlotAccessor`` wrapping this object."""
    accessor = plotting.PlotAccessor(self)
    return accessor

def _slice(
self,
start: typing.Optional[int] = None,
Expand Down
11 changes: 11 additions & 0 deletions docs/reference/bigframes.pandas/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,14 @@ DataFrame
:members:
:inherited-members:
:undoc-members:

Accessors
---------

Plotting handling
^^^^^^^^^^^^^^^^^

.. automodule:: bigframes.operations.plotting
:members:
:inherited-members:
:undoc-members:
9 changes: 9 additions & 0 deletions docs/reference/bigframes.pandas/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,12 @@ Struct handling
:members:
:inherited-members:
:undoc-members:

Plotting handling
^^^^^^^^^^^^^^^^^

.. automodule:: bigframes.operations.plotting
:members:
:inherited-members:
:undoc-members:
:noindex:
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
"matplotlib >= 3.7.1",
]
extras = {
# Optional test dependencies packages. If they're missed, may skip some tests.
Expand Down
1 change: 1 addition & 0 deletions testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ sqlglot==20.8.0
tabulate==0.9
ipywidgets==7.7.1
humanize==4.6.0
matplotlib==3.7.1
# extras
pandas-gbq==0.19.0
Loading

0 comments on commit 4aadff4

Please sign in to comment.