Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Allow DataFrame binary ops to align on either axis and with loc… #544

Merged
merged 10 commits into from
Apr 4, 2024
49 changes: 49 additions & 0 deletions bigframes/core/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import pandas as pd

import bigframes.core.indexes.index as index
import bigframes.series as series


def normalize_to_bf_series(obj, default_index: index.Index) -> series.Series:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer just "to_bf_series" since to_blah seems like a common name for things that basically just convert, and "normalize" has so many possible meanings.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

if isinstance(obj, series.Series):
return obj
if isinstance(obj, pd.Series):
return series.Series(obj)
if isinstance(obj, index.Index):
return series.Series(obj, default_index)
if isinstance(obj, pd.Index):
return series.Series(obj, default_index)
if pd.api.types.is_list_like(obj):
return series.Series(obj, default_index)
else:
raise TypeError(f"Cannot interpret {obj} as series.")


def normalize_to_pd_series(obj, default_index: pd.Index) -> pd.Series:
if isinstance(obj, series.Series):
return obj.to_pandas()
if isinstance(obj, pd.Series):
return obj
if isinstance(obj, index.Index):
return pd.Series(obj.to_pandas(), default_index)
if isinstance(obj, pd.Index):
return pd.Series(obj, default_index)
if pd.api.types.is_list_like(obj):
return pd.Series(obj, default_index)
else:
raise TypeError(f"Cannot interpret {obj} as series.")
97 changes: 74 additions & 23 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import bigframes.core.guid
import bigframes.core.indexers as indexers
import bigframes.core.indexes as indexes
import bigframes.core.normalize
import bigframes.core.ordering as order
import bigframes.core.utils as utils
import bigframes.core.window
Expand Down Expand Up @@ -663,22 +664,24 @@ def _apply_binop(
how: str = "outer",
reverse: bool = False,
):
if isinstance(other, (float, int)):
if isinstance(other, (float, int, bool)):
return self._apply_scalar_binop(other, op, reverse=reverse)
elif isinstance(other, indexes.Index):
return self._apply_series_binop(
other.to_series(index=self.index),
op,
axis=axis,
how=how,
reverse=reverse,
)
elif isinstance(other, bigframes.series.Series):
return self._apply_series_binop(
other, op, axis=axis, how=how, reverse=reverse
)
elif isinstance(other, DataFrame):
return self._apply_dataframe_binop(other, op, how=how, reverse=reverse)
elif isinstance(other, pandas.DataFrame):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might need to add pandas.DataFrame to the type hint for other?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For what purpose? Mypy is able to infer based on the isinstance checks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, gotcha, but should we remove the other type hints then? Just for consistency/cleanliness.

return self._apply_dataframe_binop(
DataFrame(other), op, how=how, reverse=reverse
)
elif utils.get_axis_number(axis) == 0:
bf_series = bigframes.core.normalize.normalize_to_bf_series(
other, self.index
)
return self._apply_series_binop_axis_0(bf_series, op, how, reverse)
elif utils.get_axis_number(axis) == 1:
pd_series = bigframes.core.normalize.normalize_to_pd_series(
other, self.columns
)
return self._apply_series_binop_axis_1(pd_series, op, how, reverse)
raise NotImplementedError(
f"binary operation is not implemented on the second operand of type {type(other).__name__}."
f"{constants.FEEDBACK_LINK}"
Expand All @@ -700,22 +703,13 @@ def _apply_scalar_binop(
block = block.drop_columns([column_id])
return DataFrame(block)

def _apply_series_binop(
def _apply_series_binop_axis_0(
self,
other: bigframes.series.Series,
op: ops.BinaryOp,
axis: str | int = "columns",
how: str = "outer",
reverse: bool = False,
) -> DataFrame:
if axis not in ("columns", "index", 0, 1):
raise ValueError(f"Invalid input: axis {axis}.")

if axis in ("columns", 1):
raise NotImplementedError(
f"Row Series operations haven't been supported. {constants.FEEDBACK_LINK}"
)

block, (get_column_left, get_column_right) = self._block.join(
other._block, how=how
)
Expand All @@ -738,6 +732,63 @@ def _apply_series_binop(
block = block.with_index_labels(self.index.names)
return DataFrame(block)

def _apply_series_binop_axis_1(
self,
other: pandas.Series,
op: ops.BinaryOp,
how: str = "outer",
reverse: bool = False,
) -> DataFrame:
# Somewhat different alignment than df-df so separate codepath for now.
if self.columns.equals(other.index):
columns, lcol_indexer, rcol_indexer = self.columns, None, None
else:
if not (self.columns.is_unique and other.index.is_unique):
raise ValueError("Cannot align non-unique indices")
columns, lcol_indexer, rcol_indexer = self.columns.join(
other.index, how=how, return_indexers=True
)

binop_result_ids = []

column_indices = zip(
lcol_indexer if (lcol_indexer is not None) else range(len(columns)),
rcol_indexer if (rcol_indexer is not None) else range(len(columns)),
)

block = self._block
for left_index, right_index in column_indices:
if left_index >= 0 and right_index >= 0: # -1 indices indicate missing
self_col_id = self._block.value_columns[left_index]
other_scalar = other.iloc[right_index]
expr = (
op.as_expr(ex.const(other_scalar), self_col_id)
if reverse
else op.as_expr(self_col_id, ex.const(other_scalar))
)
elif left_index >= 0:
self_col_id = self._block.value_columns[left_index]
expr = (
op.as_expr(ex.const(None), self_col_id)
if reverse
else op.as_expr(self_col_id, ex.const(None))
)
elif right_index >= 0:
other_scalar = other.iloc[right_index]
expr = (
op.as_expr(ex.const(other_scalar), ex.const(None))
if reverse
else op.as_expr(ex.const(None), ex.const(other_scalar))
)
else:
# Should not be possible
raise ValueError("No right or left index.")
block, result_col_id = block.project_expr(expr)
binop_result_ids.append(result_col_id)

block = block.select_columns(binop_result_ids)
return DataFrame(block.with_column_labels(columns))

def _apply_dataframe_binop(
self,
other: DataFrame,
Expand Down
Empty file added bigframes/typing.py
Empty file.
32 changes: 32 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

import bigframes
import bigframes._config.display_options as display_options
import bigframes.core.indexes.index as bf_indexes
import bigframes.dataframe as dataframe
import bigframes.series as series
from tests.system.utils import (
Expand Down Expand Up @@ -2074,6 +2075,37 @@ def test_series_binop_axis_index(
assert_pandas_df_equal(bf_result, pd_result)


@skip_legacy_pandas
@pytest.mark.parametrize(
("input"),
[
((1000, 2000, 3000)),
(pd.Index([1000, 2000, 3000])),
(bf_indexes.Index([1000, 2000, 3000])),
(pd.Series((1000, 2000), index=["int64_too", "float64_col"])),
(series.Series((1000, 2000), index=["int64_too", "float64_col"])),
],
ids=[
"tuple",
"pd_index",
"bf_index",
"pd_series",
"bf_series",
],
)
def test_listlike_binop_axis_1(scalars_dfs, input):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add axis=1 to some existing tests for various binary ops?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

axis=1 completely changes alignment so the objects would just fail to align on this axis for existing tests. I'm not too worried about the different ops breaking, the actual scalar op compilation doesn't change for axis=1, the big worry is alignment failing

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, right, the existing objects wouldn't wouldn't work. Okay, sounds good.

scalars_df, scalars_pandas_df = scalars_dfs

df_columns = ["int64_col", "float64_col", "int64_too"]

bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas()
if hasattr(input, "to_pandas"):
input = input.to_pandas()
pd_result = scalars_pandas_df[df_columns].add(input, axis=1)

assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)


@pytest.mark.parametrize(
("left_labels", "right_labels"),
[
Expand Down
Loading