From a25d0728e0f762d7bb03284e472cecf8c56f7c69 Mon Sep 17 00:00:00 2001
From: Huan Chen <huanc@google.com>
Date: Fri, 22 Mar 2024 20:36:44 +0000
Subject: [PATCH 1/3] feat: support dataframe.cov

---
 bigframes/core/blocks.py                      | 18 ++++++++----
 bigframes/dataframe.py                        | 14 ++++++----
 tests/system/small/test_dataframe.py          | 28 +++++++++++++++++++
 .../bigframes_vendored/pandas/core/frame.py   | 14 +++++++++-
 4 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 178d698f8d..066ad5df20 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -1106,13 +1106,21 @@ def summarize(
             index_columns=[label_col_id],
         )
 
-    def corr(self):
-        """Returns a block object to compute the self-correlation on this block."""
+    def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
+        """
+        Returns a block object to compute pairwise metrics among all value columns in this block.
+
+        The metric to be computed is specified by the `op` parameter, which can be either a
+        correlation operation (default) or a covariance operation.
+        """
+        if len(self.value_columns) > 30:
+            raise NotImplementedError(
+                f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
+            )
+
         aggregations = [
             (
-                ex.BinaryAggregation(
-                    agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col)
-                ),
+                ex.BinaryAggregation(op, ex.free_var(left_col), ex.free_var(right_col)),
                 f"{left_col}-{right_col}",
             )
             for left_col in self.value_columns
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 4e447c547f..0be4377ab9 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1019,17 +1019,21 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr
             raise NotImplementedError(
                 f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
             )
-        if len(self.columns) > 30:
-            raise NotImplementedError(
-                f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
-            )
 
         if not numeric_only:
             frame = self._raise_on_non_numeric("corr")
         else:
             frame = self._drop_non_numeric()
 
-        return DataFrame(frame._block.corr())
+        return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CorrOp()))
+
+    def cov(self, *, numeric_only: bool = False) -> DataFrame:
+        if not numeric_only:
+            frame = self._raise_on_non_numeric("cov")
+        else:
+            frame = self._drop_non_numeric()
+
+        return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CovOp()))
 
     def to_pandas(
         self,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 3b6cd8c05f..84caaf8359 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1915,6 +1915,34 @@ def test_corr_w_invalid_parameters(scalars_dfs):
         scalars_df[columns].corr(min_periods=1)
 
 
+@pytest.mark.parametrize(
+    ("columns", "numeric_only"),
+    [
+        (["bool_col", "int64_col", "float64_col"], True),
+        (["bool_col", "int64_col", "float64_col"], False),
+        (["bool_col", "int64_col", "float64_col", "string_col"], True),
+        pytest.param(
+            ["bool_col", "int64_col", "float64_col", "string_col"],
+            False,
+            marks=pytest.mark.xfail(
+                raises=NotImplementedError,
+            ),
+        ),
+    ],
+)
+def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas()
+    pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
 @pytest.mark.parametrize(
     ("op"),
     [
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 4eceb8a2f1..b54707dcba 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2835,10 +2835,23 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame:
                 Include only float, int, boolean, decimal data.
 
         Returns:
-            DataFrame:  Correlation matrix.
+            DataFrame: Correlation matrix.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def cov(self, *, numeric_only) -> DataFrame:
+        """
+        Compute pairwise covariance of columns, excluding NA/null values.
+
+        Args:
+            numeric_only(bool, default False):
+                Include only float, int, boolean, decimal data.
+
+        Returns:
+            DataFrame: The covariance matrix of the series of the DataFrame.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def update(
         self, other, join: str = "left", overwrite: bool = True, filter_func=None
     ) -> DataFrame:

From 3d14d84ff6e7b0b33b304bb5f8409192ccf9799a Mon Sep 17 00:00:00 2001
From: Huan Chen <huanc@google.com>
Date: Fri, 22 Mar 2024 21:03:06 +0000
Subject: [PATCH 2/3] Update code example

---
 .../bigframes_vendored/pandas/core/frame.py      | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index b54707dcba..7851381f64 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2843,6 +2843,22 @@ def cov(self, *, numeric_only) -> DataFrame:
         """
         Compute pairwise covariance of columns, excluding NA/null values.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'A': [1, 2, 3],
+            ...                    'B': [400, 500, 600],
+            ...                    'C': [0.8, 0.4, 0.9]})
+            >>> df.cov(numeric_only=True)
+                   A        B     C
+            A    1.0    100.0  0.05
+            B  100.0  10000.0   5.0
+            C   0.05      5.0  0.07
+            <BLANKLINE>
+            [3 rows x 3 columns]
+
         Args:
             numeric_only(bool, default False):
                 Include only float, int, boolean, decimal data.

From be5022ca5daa71f7334c0f7e81153743b866a65e Mon Sep 17 00:00:00 2001
From: Huan Chen <huanc@google.com>
Date: Fri, 22 Mar 2024 22:18:30 +0000
Subject: [PATCH 3/3] Update test.

---
 bigframes/core/blocks.py              |  3 ++-
 tests/system/small/test_multiindex.py | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 066ad5df20..2244668a47 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -1115,7 +1115,8 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
         """
         if len(self.value_columns) > 30:
             raise NotImplementedError(
-                f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
+                "This function supports dataframes with 30 columns or fewer. "
+                f"Provided dataframe has {len(self.value_columns)} columns. {constants.FEEDBACK_LINK}"
             )
 
         aggregations = [
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index 4a293526df..4cba38975c 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -920,6 +920,27 @@ def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index):
     )
 
 
+def test_cov_w_multi_index(scalars_df_index, scalars_pandas_df_index):
+    columns = ["int64_too", "float64_col", "int64_col"]
+    multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2]))
+
+    bf = scalars_df_index[columns].copy()
+    bf.columns = multi_columns
+
+    pd_df = scalars_pandas_df_index[columns].copy()
+    pd_df.columns = multi_columns
+
+    bf_result = bf.cov(numeric_only=True).to_pandas()
+    pd_result = pd_df.cov(numeric_only=True)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pandas.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
 @pytest.mark.parametrize(
     ("index_names",),
     [