feat: support dataframe.cov (#498)

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Document: https://screenshot.googleplex.com/9egi7MsNj2uWHkH Fixes #<issue_number_goes_here> 🦕
googleapis · Mar 22, 2024 · c4beafd · c4beafd
1 parent 97afad9
commit c4beafd
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 11 deletions.
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -1110,13 +1110,22 @@ def summarize(
             index_columns=[label_col_id],
         )
 
-    def corr(self):
-        """Returns a block object to compute the self-correlation on this block."""
+    def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
+        """
+        Returns a block object to compute pairwise metrics among all value columns in this block.
+
+        The metric to be computed is specified by the `op` parameter, which can be either a
+        correlation operation (default) or a covariance operation.
+        """
+        if len(self.value_columns) > 30:
+            raise NotImplementedError(
+                "This function supports dataframes with 30 columns or fewer. "
+                f"Provided dataframe has {len(self.value_columns)} columns. {constants.FEEDBACK_LINK}"
+            )
+
         aggregations = [
             (
-                ex.BinaryAggregation(
-                    agg_ops.CorrOp(), ex.free_var(left_col), ex.free_var(right_col)
-                ),
+                ex.BinaryAggregation(op, ex.free_var(left_col), ex.free_var(right_col)),
                 f"{left_col}-{right_col}",
             )
             for left_col in self.value_columns

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -1019,17 +1019,21 @@ def corr(self, method="pearson", min_periods=None, numeric_only=False) -> DataFr
             raise NotImplementedError(
                 f"min_periods not yet supported. {constants.FEEDBACK_LINK}"
             )
-        if len(self.columns) > 30:
-            raise NotImplementedError(
-                f"Only work with dataframes containing fewer than 30 columns. Current: {len(self.columns)}. {constants.FEEDBACK_LINK}"
-            )
 
         if not numeric_only:
             frame = self._raise_on_non_numeric("corr")
         else:
             frame = self._drop_non_numeric()
 
-        return DataFrame(frame._block.corr())
+        return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CorrOp()))
+
+    def cov(self, *, numeric_only: bool = False) -> DataFrame:
+        # Pass this method's own name: corr() passes "corr" to the same helper,
+        # so the argument is presumably the operation name reported when
+        # non-numeric columns are rejected (confirm in _raise_on_non_numeric).
+        if not numeric_only:
+            frame = self._raise_on_non_numeric("cov")
+        else:
+            frame = self._drop_non_numeric()
+
+        # Same pairwise aggregation that corr() uses, with the covariance op.
+        return DataFrame(frame._block.calculate_pairwise_metric(op=agg_ops.CovOp()))
 
     def to_pandas(
         self,

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -1916,6 +1916,34 @@ def test_corr_w_invalid_parameters(scalars_dfs):
         scalars_df[columns].corr(min_periods=1)
 
 
+@pytest.mark.parametrize(
+    ("columns", "numeric_only"),
+    [
+        (["bool_col", "int64_col", "float64_col"], True),
+        (["bool_col", "int64_col", "float64_col"], False),
+        (["bool_col", "int64_col", "float64_col", "string_col"], True),
+        pytest.param(
+            ["bool_col", "int64_col", "float64_col", "string_col"],
+            False,
+            marks=pytest.mark.xfail(
+                raises=NotImplementedError,
+            ),
+        ),
+    ],
+)
+def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only):
+    """Compare DataFrame.cov with pandas for each column set and numeric_only flag."""
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas()
+    pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
 @pytest.mark.parametrize(
     ("op"),
     [

diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
@@ -921,6 +921,27 @@ def test_corr_w_multi_index(scalars_df_index, scalars_pandas_df_index):
     )
 
 
+def test_cov_w_multi_index(scalars_df_index, scalars_pandas_df_index):
+    """Compare DataFrame.cov with pandas when the column labels are a MultiIndex."""
+    columns = ["int64_too", "float64_col", "int64_col"]
+    multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "b"], [1, 2, 2]))
+
+    bf = scalars_df_index[columns].copy()
+    bf.columns = multi_columns
+
+    pd_df = scalars_pandas_df_index[columns].copy()
+    pd_df.columns = multi_columns
+
+    bf_result = bf.cov(numeric_only=True).to_pandas()
+    pd_result = pd_df.cov(numeric_only=True)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pandas.testing.assert_frame_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
 @pytest.mark.parametrize(
     ("index_names",),
     [

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -2834,10 +2834,38 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame:
                 Include only float, int, boolean, decimal data.
 
         Returns:
-            DataFrame:  Correlation matrix.
+            DataFrame: Correlation matrix.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def cov(self, *, numeric_only) -> DataFrame:
+        """
+        Compute pairwise covariance of columns, excluding NA/null values.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'A': [1, 2, 3],
+            ...                    'B': [400, 500, 600],
+            ...                    'C': [0.8, 0.4, 0.9]})
+            >>> df.cov(numeric_only=True)
+                   A        B     C
+            A    1.0    100.0  0.05
+            B  100.0  10000.0   5.0
+            C   0.05      5.0  0.07
+            <BLANKLINE>
+            [3 rows x 3 columns]
+
+        Args:
+            numeric_only(bool, default False):
+                Include only float, int, boolean, decimal data.
+
+        Returns:
+            DataFrame: The covariance matrix of the series of the DataFrame.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def update(
         self, other, join: str = "left", overwrite: bool = True, filter_func=None
     ) -> DataFrame: