From a25d0728e0f762d7bb03284e472cecf8c56f7c69 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 22 Mar 2024 20:36:44 +0000 Subject: [PATCH 1/3] feat: support dataframe.cov --- bigframes/core/blocks.py | 18 ++++++++---- bigframes/dataframe.py | 14 ++++++---- tests/system/small/test_dataframe.py | 28 +++++++++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 14 +++++++++- 4 files changed, 63 insertions(+), 11 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 178d698f8d..066ad5df20 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1106,13 +1106,21 @@ def summarize( index_columns=[label_col_id], ) - def corr(self): - """Returns a block object to compute the self-correlation on this block.""" + def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): + """ + Returns a block object to compute pairwise metrics among all value columns in this block. + + The metric to be computed is specified by the `op` parameter, which can be either a + correlation operation (default) or a covariance operation. + """ + if len(self.value_columns) > 30: + raise NotImplementedError( + f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}" + ) + aggregations = [ ( - ex.BinaryAggregation( - agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col) - ), + ex.BinaryAggregation(op, ex.free_var(left_col), ex.free_var(right_col)), f"{left_col}-{right_col}", ) for left_col in self.value_columns diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4e447c547f..0be4377ab9 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1019,17 +1019,21 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr raise NotImplementedError( f"min_periods not yet supported. {constants.FEEDBACK_LINK}" ) - if len(self.columns) > 30: - raise NotImplementedError( - f"Only work with dataframes containing fewer than 30 columns. 
Current: {len(self.columns)}. {constants.FEEDBACK_LINK}" - ) if not numeric_only: frame = self._raise_on_non_numeric("corr") else: frame = self._drop_non_numeric() - return DataFrame(frame._block.corr()) + return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CorrOp())) + + def cov(self, *, numeric_only: bool = False) -> DataFrame: + if not numeric_only: + frame = self._raise_on_non_numeric("cov") + else: + frame = self._drop_non_numeric() + + return DataFrame(frame._block.calculate_pairwise_metric(agg_ops.CovOp())) def to_pandas( self, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 3b6cd8c05f..84caaf8359 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1915,6 +1915,34 @@ def test_corr_w_invalid_parameters(scalars_dfs): scalars_df[columns].corr(min_periods=1) +@pytest.mark.parametrize( + ("columns", "numeric_only"), + [ + (["bool_col", "int64_col", "float64_col"], True), + (["bool_col", "int64_col", "float64_col"], False), + (["bool_col", "int64_col", "float64_col", "string_col"], True), + pytest.param( + ["bool_col", "int64_col", "float64_col", "string_col"], + False, + marks=pytest.mark.xfail( + raises=NotImplementedError, + ), + ), + ], +) +def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas() + pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses string, Pandas uses object. 
+ pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("op"), [ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 4eceb8a2f1..b54707dcba 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2835,10 +2835,22 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: Include only float, int, boolean, decimal data. Returns: - DataFrame: Correlation matrix. + DataFrame: Correlation matrix. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def cov(self, *, numeric_only) -> DataFrame: + """ + Compute pairwise covariance of columns, excluding NA/null values. + + Args: + numeric_only(bool, default False): + Include only float, int, boolean, decimal data. + + Returns: + DataFrame: The covariance matrix of the series of the DataFrame. + """ + def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None ) -> DataFrame: From 3d14d84ff6e7b0b33b304bb5f8409192ccf9799a Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 22 Mar 2024 21:03:06 +0000 Subject: [PATCH 2/3] Update code example --- .../bigframes_vendored/pandas/core/frame.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b54707dcba..7851381f64 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2843,6 +2843,22 @@ def cov(self, *, numeric_only) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600], + ... 
'C': [0.8, 0.4, 0.9]}) + >>> df.cov(numeric_only=True) + A B C + A 1.0 100.0 0.05 + B 100.0 10000.0 5.0 + C 0.05 5.0 0.07 + + [3 rows x 3 columns] + Args: numeric_only(bool, default False): Include only float, int, boolean, decimal data. From be5022ca5daa71f7334c0f7e81153743b866a65e Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 22 Mar 2024 22:18:30 +0000 Subject: [PATCH 3/3] Update test. --- bigframes/core/blocks.py | 3 ++- tests/system/small/test_multiindex.py | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 066ad5df20..2244668a47 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1115,7 +1115,8 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): """ if len(self.value_columns) > 30: raise NotImplementedError( - f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}" + "This function supports dataframes with 30 columns or fewer. " + f"Provided dataframe has {len(self.value_columns)} columns. 
{constants.FEEDBACK_LINK}" ) aggregations = [ diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 4a293526df..4cba38975c 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -920,6 +920,27 @@ def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index): ) +def test_cov_w_multi_index(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "float64_col", "int64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2])) + + bf = scalars_df_index[columns].copy() + bf.columns = multi_columns + + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf.cov(numeric_only=True).to_pandas() + pd_result = pd_df.cov(numeric_only=True) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses string, Pandas uses object. + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("index_names",), [