Skip to content

Commit

Permalink
feat: support dataframe.cov (#498)
Browse files Browse the repository at this point in the history
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Document:
https://screenshot.googleplex.com/9egi7MsNj2uWHkH

Fixes #<issue_number_goes_here> 🦕
  • Loading branch information
Genesis929 authored Mar 22, 2024
1 parent 97afad9 commit c4beafd
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 11 deletions.
19 changes: 14 additions & 5 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1110,13 +1110,22 @@ def summarize(
index_columns=[label_col_id],
)

def corr(self):
"""Returns a block object to compute the self-correlation on this block."""
def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
"""
Returns a block object to compute pairwise metrics among all value columns in this block.
The metric to be computed is specified by the `op` parameter, which can be either a
correlation operation (default) or a covariance operation.
"""
if len(self.value_columns) > 30:
raise NotImplementedError(
"This function supports dataframes with 30 columns or fewer. "
f"Provided dataframe has {len(self.value_columns)} columns. {constants.FEEDBACK_LINK}"
)

aggregations = [
(
ex.BinaryAggregation(
agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col)
),
ex.BinaryAggregation(op, ex.free_var(left_col), ex.free_var(right_col)),
f"{left_col}-{right_col}",
)
for left_col in self.value_columns
Expand Down
14 changes: 9 additions & 5 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,17 +1019,21 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr
raise NotImplementedError(
f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
)
if len(self.columns) > 30:
raise NotImplementedError(
f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
)

if not numeric_only:
frame = self._raise_on_non_numeric("corr")
else:
frame = self._drop_non_numeric()

return DataFrame(frame._block.corr())
return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CorrOp()))

def cov(self, *, numeric_only: bool = False) -> DataFrame:
if not numeric_only:
frame = self._raise_on_non_numeric("corr")
else:
frame = self._drop_non_numeric()

return DataFrame(frame._block.calculate_pairwise_metric(agg_ops.CovOp()))

def to_pandas(
self,
Expand Down
28 changes: 28 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1916,6 +1916,34 @@ def test_corr_w_invalid_parameters(scalars_dfs):
scalars_df[columns].corr(min_periods=1)


@pytest.mark.parametrize(
("columns", "numeric_only"),
[
(["bool_col", "int64_col", "float64_col"], True),
(["bool_col", "int64_col", "float64_col"], False),
(["bool_col", "int64_col", "float64_col", "string_col"], True),
pytest.param(
["bool_col", "int64_col", "float64_col", "string_col"],
False,
marks=pytest.mark.xfail(
raises=NotImplementedError,
),
),
],
)
def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas()
pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only)

# BigFrames and Pandas differ in their data type handling:
# - Column types: BigFrames uses Float64, Pandas uses float64.
# - Index types: BigFrames uses strign, Pandas uses object.
pd.testing.assert_frame_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
)


@pytest.mark.parametrize(
("op"),
[
Expand Down
21 changes: 21 additions & 0 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -921,6 +921,27 @@ def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index):
)


def test_cov_w_multi_index(scalars_df_index, scalars_pandas_df_index):
columns = ["int64_too", "float64_col", "int64_col"]
multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2]))

bf = scalars_df_index[columns].copy()
bf.columns = multi_columns

pd_df = scalars_pandas_df_index[columns].copy()
pd_df.columns = multi_columns

bf_result = bf.cov(numeric_only=True).to_pandas()
pd_result = pd_df.cov(numeric_only=True)

# BigFrames and Pandas differ in their data type handling:
# - Column types: BigFrames uses Float64, Pandas uses float64.
# - Index types: BigFrames uses string, Pandas uses object.
pandas.testing.assert_frame_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
)


@pytest.mark.parametrize(
("index_names",),
[
Expand Down
30 changes: 29 additions & 1 deletion third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2834,10 +2834,38 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame:
Include only float, int, boolean, decimal data.
Returns:
DataFrame: Correlation matrix.
DataFrame: Correlation matrix.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def cov(self, *, numeric_only) -> DataFrame:
"""
Compute pairwise covariance of columns, excluding NA/null values.
**Examples:**
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> df = bpd.DataFrame({'A': [1, 2, 3],
... 'B': [400, 500, 600],
... 'C': [0.8, 0.4, 0.9]})
>>> df.cov(numeric_only=True)
A B C
A 1.0 100.0 0.05
B 100.0 10000.0 5.0
C 0.05 5.0 0.07
<BLANKLINE>
[3 rows x 3 columns]
Args:
numeric_only(bool, default False):
Include only float, int, boolean, decimal data.
Returns:
DataFrame: The covariance matrix of the series of the DataFrame.
"""

def update(
self, other, join: str = "left", overwrite: bool = True, filter_func=None
) -> DataFrame:
Expand Down

0 comments on commit c4beafd

Please sign in to comment.