Skip to content

Commit

Permalink
feat: (Series|Dataframe).plot.hist()
Browse files Browse the repository at this point in the history
  • Loading branch information
chelsea-lin committed Mar 7, 2024
1 parent 38bd2ba commit a2ac563
Show file tree
Hide file tree
Showing 11 changed files with 478 additions and 0 deletions.
5 changes: 5 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
import bigframes.formatting_helpers as formatter
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.plot as plot
import bigframes.series
import bigframes.series as bf_series
import bigframes.session._io.bigquery
Expand Down Expand Up @@ -3190,4 +3191,8 @@ def get_right_id(id):

return result

@property
def plot(self):
    """Return a ``PlotAccessor`` wrapping this object."""
    # ``plot`` here resolves to the module-level ``bigframes.operations.plot``
    # import, not to this property.
    accessor = plot.PlotAccessor(self)
    return accessor

__matmul__ = dot
30 changes: 30 additions & 0 deletions bigframes/operations/_matplotlib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from bigframes.operations._matplotlib.core import MPLPlot
from bigframes.operations._matplotlib.hist import HistPlot

# Registry mapping each supported plot ``kind`` string to the MPLPlot subclass
# that implements it; ``plot()`` below looks classes up here.
PLOT_CLASSES: dict[str, type[MPLPlot]] = {
"hist": HistPlot,
}


def plot(data, kind, **kwargs):
    """Build, render and return the plot of the requested ``kind``.

    Args:
        data: Object handed to the plot class constructor.
        kind: Key into ``PLOT_CLASSES`` selecting the plot class.
        **kwargs: Forwarded unchanged to the plot class constructor.

    Returns:
        The ``result`` attribute of the constructed plot object.

    Raises:
        KeyError: If ``kind`` is not a key of ``PLOT_CLASSES``.
    """
    plot_cls = PLOT_CLASSES[kind]
    plotter = plot_cls(data, **kwargs)
    plotter.generate()
    plotter.draw()
    return plotter.result


# Public API of this package: only the ``plot`` entry point is exported.
__all__ = ["plot"]
30 changes: 30 additions & 0 deletions bigframes/operations/_matplotlib/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod

import matplotlib.pyplot as plt


class MPLPlot(ABC):
    """Abstract base for plot objects.

    Subclasses implement ``generate()`` and are expected to set ``self.axes``,
    which ``result`` returns.
    """

    @abstractmethod
    def generate(self):
        """Produce the plot; must be overridden by concrete subclasses."""

    def draw(self) -> None:
        """Ask pyplot to redraw if it is running in interactive mode."""
        plt.draw_if_interactive()

    @property
    def result(self):
        """The ``axes`` attribute stored on this plot object."""
        axes = self.axes
        return axes
134 changes: 134 additions & 0 deletions bigframes/operations/_matplotlib/hist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
from typing import Literal

import numpy as np
import pandas as pd

import bigframes.constants as constants
from bigframes.operations._matplotlib.core import MPLPlot


class HistPlot(MPLPlot):
    """Histogram plot whose per-bin counts are computed via ``bigframes.pandas.cut``.

    The bin counts are downloaded with ``to_pandas()`` and then drawn through
    pandas' own ``DataFrame.plot.hist``, using the bin midpoints as samples and
    the counts as weights.
    """

    @property
    def _kind(self) -> Literal["hist"]:
        """Identifier of this plot kind."""
        return "hist"

    def __init__(
        self,
        data,
        bins: int = 10,
        **kwargs,
    ) -> None:
        """Validate the arguments and select the numeric data to plot.

        Args:
            data: Series or DataFrame-like object to plot.
            bins: Number of histogram bins; must be an ``int``.
            **kwargs: Extra options. ``by`` is removed here; ``label`` is read
                to name a Series column; the rest is forwarded to
                ``DataFrame.plot.hist`` in ``generate()``.

        Raises:
            NotImplementedError: If ``by`` is not None, ``bins`` is not an
                int, or a non-None ``weight``/``weights`` is supplied.
            TypeError: If ``data`` has no numeric columns.
        """
        self.bins = bins
        self.label = kwargs.get("label", None)
        self.by = kwargs.pop("by", None)
        self.kwargs = kwargs

        if self.by is not None:
            raise NotImplementedError(
                f"Non-none `by` argument is not yet supported. {constants.FEEDBACK_LINK}"
            )
        if not isinstance(self.bins, int):
            raise NotImplementedError(
                f"Only integer values are supported for the `bins` argument. {constants.FEEDBACK_LINK}"
            )
        # ``weights`` is the keyword pandas/matplotlib actually use; ``weight``
        # is still checked for backward compatibility. Without the ``weights``
        # check a user-supplied value would collide with the ``weights=``
        # argument that generate() passes itself.
        for arg_name in ("weight", "weights"):
            if kwargs.get(arg_name, None) is not None:
                raise NotImplementedError(
                    f"Non-none `{arg_name}` argument is not yet supported. {constants.FEEDBACK_LINK}"
                )
        # An explicit ``weights=None`` would still be a duplicate keyword in
        # generate(), so drop it from the forwarded options.
        kwargs.pop("weights", None)

        self.data = self._compute_plot_data(data)

    def generate(self) -> None:
        """Compute the histogram bars and draw them, storing the axes in ``self.axes``."""
        hist_bars = self._calculate_hist_bar(self.data, self.bins)

        bin_edges = None
        hist_x = {}
        weights = {}
        for col_name, hist_bar in hist_bars.items():
            left = hist_bar.index.get_level_values("left_exclusive")
            right = hist_bar.index.get_level_values("right_inclusive")

            # Each bin is represented by its midpoint, weighted by its count.
            hist_x[col_name] = pd.Series((left + right) / 2.0)
            weights[col_name] = hist_bar.values
            # Accumulate the edges seen across all columns.
            if bin_edges is None:
                bin_edges = left.union(right)
            else:
                bin_edges = left.union(right).union(bin_edges)

        bins = None
        if bin_edges is not None:
            # Only the edges returned by np.histogram are used; they span all
            # collected bin edges (or the user-supplied ``range``).
            _, bins = np.histogram(
                bin_edges, bins=self.bins, range=self.kwargs.get("range", None)
            )

        # Fills with NA values when items have different lengths.
        ordered_columns = self.data.columns.values
        hist_x_pd = pd.DataFrame(
            list(itertools.zip_longest(*hist_x.values())), columns=list(hist_x.keys())
        ).sort_index(axis=1)
        weights_pd = pd.DataFrame(
            list(itertools.zip_longest(*weights.values())), columns=list(weights.keys())
        ).sort_index(axis=1)

        self.axes = hist_x_pd[ordered_columns].plot.hist(
            bins=bins,
            weights=np.array(weights_pd[ordered_columns].values),
            **self.kwargs,
        )  # type: ignore

    def _compute_plot_data(self, data):
        """Return the numeric columns of ``data`` as a frame.

        A Series is first converted with ``to_frame``, named after
        ``self.label`` when given, or ``""`` when it has neither label nor name.

        Raises:
            TypeError: If no numeric columns remain.
        """
        # Importing at the top of the file causes a circular import.
        import bigframes.series as series

        if isinstance(data, series.Series):
            label = self.label
            if label is None and data.name is None:
                label = ""
            if label is None:
                data = data.to_frame()
            else:
                data = data.to_frame(name=label)

        # TODO(chelsealin): Support timestamp/date types here.
        include_type = ["number"]
        numeric_data = data.select_dtypes(include=include_type)
        try:
            is_empty = numeric_data.columns.empty
        except AttributeError:
            is_empty = not len(numeric_data)

        if is_empty:
            raise TypeError("no numeric data to plot")

        return numeric_data

    @staticmethod
    def _calculate_hist_bar(data, bins):
        """Return a dict mapping each column of ``data`` to its bin counts.

        Each value is the ``to_pandas()`` result of ``value_counts()`` over the
        exploded ``bpd.cut`` intervals, sorted by the ``left_exclusive`` level.
        """
        import bigframes.pandas as bpd

        # TODO: Optimize this by batching multiple jobs into one.
        hist_bar = {}
        for col in data.columns:
            cutted_data = bpd.cut(data[col], bins=bins, labels=None)
            hist_bar[col] = (
                cutted_data.struct.explode()
                .value_counts()
                .to_pandas()
                .sort_index(level="left_exclusive")
            )
        return hist_bar
35 changes: 35 additions & 0 deletions bigframes/operations/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Sequence

import bigframes.constants as constants
import bigframes.operations._matplotlib as plotbackend
import third_party.bigframes_vendored.pandas.plotting._core as vendordt


class PlotAccessor:
    # The class docstring is reused from the vendored pandas PlotAccessor.
    __doc__ = vendordt.PlotAccessor.__doc__

    def __init__(self, data) -> None:
        # Object whose data will be plotted.
        self._parent = data

    def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs):
        """Draw a histogram of the parent object through the plotting backend.

        Raises:
            NotImplementedError: If a non-None ``backend`` keyword is given.
        """
        backend = kwargs.pop("backend", None)
        if backend is not None:
            raise NotImplementedError(
                f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}"
            )
        # Plot a copy of the parent rather than the parent itself.
        parent_copy = self._parent.copy()
        return plotbackend.plot(parent_copy, kind="hist", **kwargs, by=by, bins=bins)
5 changes: 5 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.base
import bigframes.operations.datetimes as dt
import bigframes.operations.plot as plot
import bigframes.operations.strings as strings
import bigframes.operations.structs as structs
import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series
Expand Down Expand Up @@ -1557,6 +1558,10 @@ def __array_ufunc__(
def str(self) -> strings.StringMethods:
return strings.StringMethods(self._block)

@property
def plot(self):
    """Return a ``PlotAccessor`` wrapping this object."""
    # ``plot`` here resolves to the module-level ``bigframes.operations.plot``
    # import, not to this property.
    accessor = plot.PlotAccessor(self)
    return accessor

def _slice(
self,
start: typing.Optional[int] = None,
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
"matplotlib >= 3.7.1",
]
extras = {
# Optional test dependencies packages. If they're missed, may skip some tests.
Expand Down
Loading

0 comments on commit a2ac563

Please sign in to comment.