-
Notifications
You must be signed in to change notification settings - Fork 42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Allow DataFrame binary ops to align on either axis and with loc… #544
Changes from 8 commits
1e0c39e
78369f7
bd5a8d1
3d74bef
b0624ba
ed4eef0
f00de99
f9e99a2
420ea03
8128b84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
from __future__ import annotations | ||
|
||
import pandas as pd | ||
|
||
import bigframes.core.indexes.index as index | ||
import bigframes.series as series | ||
|
||
|
||
def normalize_to_bf_series(obj, default_index: index.Index) -> series.Series: | ||
if isinstance(obj, series.Series): | ||
return obj | ||
if isinstance(obj, pd.Series): | ||
return series.Series(obj) | ||
if isinstance(obj, index.Index): | ||
return series.Series(obj, default_index) | ||
if isinstance(obj, pd.Index): | ||
return series.Series(obj, default_index) | ||
if pd.api.types.is_list_like(obj): | ||
return series.Series(obj, default_index) | ||
else: | ||
raise TypeError(f"Cannot interpret {obj} as series.") | ||
|
||
|
||
def normalize_to_pd_series(obj, default_index: pd.Index) -> pd.Series: | ||
if isinstance(obj, series.Series): | ||
return obj.to_pandas() | ||
if isinstance(obj, pd.Series): | ||
return obj | ||
if isinstance(obj, index.Index): | ||
return pd.Series(obj.to_pandas(), default_index) | ||
if isinstance(obj, pd.Index): | ||
return pd.Series(obj, default_index) | ||
if pd.api.types.is_list_like(obj): | ||
return pd.Series(obj, default_index) | ||
else: | ||
raise TypeError(f"Cannot interpret {obj} as series.") |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,6 +55,7 @@ | |
import bigframes.core.guid | ||
import bigframes.core.indexers as indexers | ||
import bigframes.core.indexes as indexes | ||
import bigframes.core.normalize | ||
import bigframes.core.ordering as order | ||
import bigframes.core.utils as utils | ||
import bigframes.core.window | ||
|
@@ -663,22 +664,24 @@ def _apply_binop( | |
how: str = "outer", | ||
reverse: bool = False, | ||
): | ||
if isinstance(other, (float, int)): | ||
if isinstance(other, (float, int, bool)): | ||
return self._apply_scalar_binop(other, op, reverse=reverse) | ||
elif isinstance(other, indexes.Index): | ||
return self._apply_series_binop( | ||
other.to_series(index=self.index), | ||
op, | ||
axis=axis, | ||
how=how, | ||
reverse=reverse, | ||
) | ||
elif isinstance(other, bigframes.series.Series): | ||
return self._apply_series_binop( | ||
other, op, axis=axis, how=how, reverse=reverse | ||
) | ||
elif isinstance(other, DataFrame): | ||
return self._apply_dataframe_binop(other, op, how=how, reverse=reverse) | ||
elif isinstance(other, pandas.DataFrame): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might need to add pandas.DataFrame to the type hint for other? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For what purpose? Mypy is able to infer based on the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, gotcha, but should we remove the other type hints then? Just for consistency/cleanliness. |
||
return self._apply_dataframe_binop( | ||
DataFrame(other), op, how=how, reverse=reverse | ||
) | ||
elif utils.get_axis_number(axis) == 0: | ||
bf_series = bigframes.core.normalize.normalize_to_bf_series( | ||
other, self.index | ||
) | ||
return self._apply_series_binop_axis_0(bf_series, op, how, reverse) | ||
elif utils.get_axis_number(axis) == 1: | ||
pd_series = bigframes.core.normalize.normalize_to_pd_series( | ||
other, self.columns | ||
) | ||
return self._apply_series_binop_axis_1(pd_series, op, how, reverse) | ||
raise NotImplementedError( | ||
f"binary operation is not implemented on the second operand of type {type(other).__name__}." | ||
f"{constants.FEEDBACK_LINK}" | ||
|
@@ -700,22 +703,13 @@ def _apply_scalar_binop( | |
block = block.drop_columns([column_id]) | ||
return DataFrame(block) | ||
|
||
def _apply_series_binop( | ||
def _apply_series_binop_axis_0( | ||
self, | ||
other: bigframes.series.Series, | ||
op: ops.BinaryOp, | ||
axis: str | int = "columns", | ||
how: str = "outer", | ||
reverse: bool = False, | ||
) -> DataFrame: | ||
if axis not in ("columns", "index", 0, 1): | ||
raise ValueError(f"Invalid input: axis {axis}.") | ||
|
||
if axis in ("columns", 1): | ||
raise NotImplementedError( | ||
f"Row Series operations haven't been supported. {constants.FEEDBACK_LINK}" | ||
) | ||
|
||
block, (get_column_left, get_column_right) = self._block.join( | ||
other._block, how=how | ||
) | ||
|
@@ -738,6 +732,63 @@ def _apply_series_binop( | |
block = block.with_index_labels(self.index.names) | ||
return DataFrame(block) | ||
|
||
def _apply_series_binop_axis_1( | ||
self, | ||
other: pandas.Series, | ||
op: ops.BinaryOp, | ||
how: str = "outer", | ||
reverse: bool = False, | ||
) -> DataFrame: | ||
# Somewhat different alignment than df-df so separate codepath for now. | ||
if self.columns.equals(other.index): | ||
columns, lcol_indexer, rcol_indexer = self.columns, None, None | ||
else: | ||
if not (self.columns.is_unique and other.index.is_unique): | ||
raise ValueError("Cannot align non-unique indices") | ||
columns, lcol_indexer, rcol_indexer = self.columns.join( | ||
other.index, how=how, return_indexers=True | ||
) | ||
|
||
binop_result_ids = [] | ||
|
||
column_indices = zip( | ||
lcol_indexer if (lcol_indexer is not None) else range(len(columns)), | ||
rcol_indexer if (rcol_indexer is not None) else range(len(columns)), | ||
) | ||
|
||
block = self._block | ||
for left_index, right_index in column_indices: | ||
if left_index >= 0 and right_index >= 0: # -1 indices indicate missing | ||
self_col_id = self._block.value_columns[left_index] | ||
other_scalar = other.iloc[right_index] | ||
expr = ( | ||
op.as_expr(ex.const(other_scalar), self_col_id) | ||
if reverse | ||
else op.as_expr(self_col_id, ex.const(other_scalar)) | ||
) | ||
elif left_index >= 0: | ||
self_col_id = self._block.value_columns[left_index] | ||
expr = ( | ||
op.as_expr(ex.const(None), self_col_id) | ||
if reverse | ||
else op.as_expr(self_col_id, ex.const(None)) | ||
) | ||
elif right_index >= 0: | ||
other_scalar = other.iloc[right_index] | ||
expr = ( | ||
op.as_expr(ex.const(other_scalar), ex.const(None)) | ||
if reverse | ||
else op.as_expr(ex.const(None), ex.const(other_scalar)) | ||
) | ||
else: | ||
# Should not be possible | ||
raise ValueError("No right or left index.") | ||
block, result_col_id = block.project_expr(expr) | ||
binop_result_ids.append(result_col_id) | ||
|
||
block = block.select_columns(binop_result_ids) | ||
return DataFrame(block.with_column_labels(columns)) | ||
|
||
def _apply_dataframe_binop( | ||
self, | ||
other: DataFrame, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ | |
|
||
import bigframes | ||
import bigframes._config.display_options as display_options | ||
import bigframes.core.indexes.index as bf_indexes | ||
import bigframes.dataframe as dataframe | ||
import bigframes.series as series | ||
from tests.system.utils import ( | ||
|
@@ -2074,6 +2075,37 @@ def test_series_binop_axis_index( | |
assert_pandas_df_equal(bf_result, pd_result) | ||
|
||
|
||
@skip_legacy_pandas | ||
@pytest.mark.parametrize( | ||
("input"), | ||
[ | ||
((1000, 2000, 3000)), | ||
(pd.Index([1000, 2000, 3000])), | ||
(bf_indexes.Index([1000, 2000, 3000])), | ||
(pd.Series((1000, 2000), index=["int64_too", "float64_col"])), | ||
(series.Series((1000, 2000), index=["int64_too", "float64_col"])), | ||
], | ||
ids=[ | ||
"tuple", | ||
"pd_index", | ||
"bf_index", | ||
"pd_series", | ||
"bf_series", | ||
], | ||
) | ||
def test_listlike_binop_axis_1(scalars_dfs, input): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add axis=1 to some existing tests for various binary ops? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. axis=1 completely changes alignment so the objects would just fail to align on this axis for existing tests. I'm not too worried about the different ops breaking, the actual scalar op compilation doesn't change for axis=1, the big worry is alignment failing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, right, the existing objects wouldn't wouldn't work. Okay, sounds good. |
||
scalars_df, scalars_pandas_df = scalars_dfs | ||
|
||
df_columns = ["int64_col", "float64_col", "int64_too"] | ||
|
||
bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas() | ||
if hasattr(input, "to_pandas"): | ||
input = input.to_pandas() | ||
pd_result = scalars_pandas_df[df_columns].add(input, axis=1) | ||
|
||
assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
("left_labels", "right_labels"), | ||
[ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would prefer just "to_bf_series" since to_blah seems like a common name for things that basically just convert, and "normalize" has so many possible meanings.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done