Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support dataframe.cov #498

Merged
merged 4 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1110,13 +1110,22 @@ def summarize(
index_columns=[label_col_id],
)

def corr(self):
"""Returns a block object to compute the self-correlation on this block."""
def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
"""
Returns a block object to compute pairwise metrics among all value columns in this block.

The metric to be computed is specified by the `op` parameter, which can be either a
correlation operation (default) or a covariance operation.
"""
if len(self.value_columns) > 30:
raise NotImplementedError(
"This function supports dataframes with 30 columns or fewer. "
f"Provided dataframe has {len(self.value_columns)} columns. {constants.FEEDBACK_LINK}"
)

aggregations = [
(
ex.BinaryAggregation(
agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col)
),
ex.BinaryAggregation(op, ex.free_var(left_col), ex.free_var(right_col)),
f"{left_col}-{right_col}",
)
for left_col in self.value_columns
Expand Down
14 changes: 9 additions & 5 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,17 +1019,21 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr
raise NotImplementedError(
f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
)
if len(self.columns) > 30:
raise NotImplementedError(
f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
)

if not numeric_only:
frame = self._raise_on_non_numeric("corr")
else:
frame = self._drop_non_numeric()

return DataFrame(frame._block.corr())
return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CorrOp()))

def cov(self, *, numeric_only: bool = False) -> DataFrame:
if not numeric_only:
frame = self._raise_on_non_numeric("corr")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"cov"

else:
frame = self._drop_non_numeric()

return DataFrame(frame._block.calculate_pairwise_metric(agg_ops.CovOp()))

def to_pandas(
self,
Expand Down
28 changes: 28 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1916,6 +1916,34 @@ def test_corr_w_invalid_parameters(scalars_dfs):
scalars_df[columns].corr(min_periods=1)


@pytest.mark.parametrize(
("columns", "numeric_only"),
[
(["bool_col", "int64_col", "float64_col"], True),
(["bool_col", "int64_col", "float64_col"], False),
(["bool_col", "int64_col", "float64_col", "string_col"], True),
pytest.param(
["bool_col", "int64_col", "float64_col", "string_col"],
False,
marks=pytest.mark.xfail(
raises=NotImplementedError,
),
),
],
)
def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas()
pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only)

# BigFrames and Pandas differ in their data type handling:
# - Column types: BigFrames uses Float64, Pandas uses float64.
# - Index types: BigFrames uses strign, Pandas uses object.
pd.testing.assert_frame_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
)


@pytest.mark.parametrize(
("op"),
[
Expand Down
21 changes: 21 additions & 0 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -921,6 +921,27 @@ def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index):
)


def test_cov_w_multi_index(scalars_df_index, scalars_pandas_df_index):
columns = ["int64_too", "float64_col", "int64_col"]
multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2]))

bf = scalars_df_index[columns].copy()
bf.columns = multi_columns

pd_df = scalars_pandas_df_index[columns].copy()
pd_df.columns = multi_columns

bf_result = bf.cov(numeric_only=True).to_pandas()
pd_result = pd_df.cov(numeric_only=True)

# BigFrames and Pandas differ in their data type handling:
# - Column types: BigFrames uses Float64, Pandas uses float64.
# - Index types: BigFrames uses string, Pandas uses object.
pandas.testing.assert_frame_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
)


@pytest.mark.parametrize(
("index_names",),
[
Expand Down
30 changes: 29 additions & 1 deletion third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2834,10 +2834,38 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame:
Include only float, int, boolean, decimal data.

Returns:
DataFrame: Correlation matrix.
DataFrame: Correlation matrix.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def cov(self, *, numeric_only) -> DataFrame:
"""
Compute pairwise covariance of columns, excluding NA/null values.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> df = bpd.DataFrame({'A': [1, 2, 3],
... 'B': [400, 500, 600],
... 'C': [0.8, 0.4, 0.9]})
>>> df.cov(numeric_only=True)
A B C
A 1.0 100.0 0.05
B 100.0 10000.0 5.0
C 0.05 5.0 0.07
<BLANKLINE>
[3 rows x 3 columns]

Args:
numeric_only(bool, default False):
Include only float, int, boolean, decimal data.

Returns:
DataFrame: The covariance matrix of the series of the DataFrame.
"""

def update(
self, other, join: str = "left", overwrite: bool = True, filter_func=None
) -> DataFrame:
Expand Down