Fix pandas SettingWithCopyWarning, which really shouldn't be ignored. (…

…#2447) See: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy This implementation requires column names to be strings, which is hopefully fine (but could be changed). The downside is that this sometimes copies data when unnecessary. It's pretty hard to know when objects are views or not when using public APIS. Pandas objects have a `._is_view` attribute that tell us this. I don't know about cugraph, or if this is even an issue in cugraph. CC @rlratzel Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: #2447
rapidsai · Jul 27, 2022 · d925926 · d925926
1 parent 830d113
commit d925926
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 8 deletions.
diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py
@@ -308,7 +308,7 @@ def add_vertex_data(self,
             # column in the incoming dataframe, since the initial merge may not
             # result in the same dtype. (see
             # https://github.com/rapidsai/cudf/issues/9981)
-            self.__update_dataframe_dtypes(
+            self.__vertex_prop_dataframe = self.__update_dataframe_dtypes(
                 self.__vertex_prop_dataframe,
                 {self.vertex_col_name: dataframe[vertex_col_name].dtype})
 
@@ -430,7 +430,7 @@ def add_edge_data(self,
             # column in the incoming dataframe, since the initial merge may not
             # result in the same dtype. (see
             # https://github.com/rapidsai/cudf/issues/9981)
-            self.__update_dataframe_dtypes(
+            self.__edge_prop_dataframe = self.__update_dataframe_dtypes(
                 self.__edge_prop_dataframe,
                 {self.src_col_name: dataframe[vertex_col_names[0]].dtype,
                  self.dst_col_name: dataframe[vertex_col_names[1]].dtype,
@@ -659,7 +659,7 @@ def extract_subgraph(self,
         # possibly had their dtypes converted in order to accommodate NaN
         # values. Restore the original dtypes in the resulting edges df prior
         # to creating a Graph.
-        self.__update_dataframe_dtypes(edges, self.__edge_prop_dtypes)
+        edges = self.__update_dataframe_dtypes(edges, self.__edge_prop_dtypes)
 
         return self.edge_props_to_graph(
             edges,
@@ -727,7 +727,9 @@ def annotate_dataframe(self, df, G, edge_vertex_col_names):
                       inplace=True)
 
         # restore the original dtypes
-        self.__update_dataframe_dtypes(new_df, self.__edge_prop_dtypes)
+        new_df = self.__update_dataframe_dtypes(
+            new_df, self.__edge_prop_dtypes
+        )
         for col in df.columns:
             new_df[col] = new_df[col].astype(df[col].dtype)
 
@@ -906,6 +908,7 @@ def __update_dataframe_dtypes(df, column_dtype_dict):
         This also handles converting standard integer dtypes to nullable
         integer dtypes, needed to accommodate NA values in columns.
         """
+        update_cols = {}
         for (col, dtype) in column_dtype_dict.items():
             # If the DataFrame is Pandas and the dtype is an integer type,
             # ensure a nullable integer array is used by specifying the correct
@@ -919,7 +922,11 @@ def __update_dataframe_dtypes(df, column_dtype_dict):
                 # Assigning to df[col] produces a (false?) warning with Pandas,
                 # but assigning to df.loc[:,col] does not update the df in
                 # cudf, so do one or the other based on type.
-                if type(df) is cudf.DataFrame:
-                    df[col] = df[col].astype(dtype_str)
-                else:
-                    df.loc[:, col] = df[col].astype(dtype_str)
+                update_cols[col] = df[col].astype(dtype_str)
+        if not update_cols:
+            return df
+        # Use df.assign to avoid assignment into df in case df is a view:
+        # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html
+        # #returning-a-view-versus-a-copy
+        # Note that this requires all column names to be strings.
+        return df.assign(**update_cols)
diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py
@@ -118,6 +118,21 @@ def setup_function():
 # =============================================================================
 # Pytest fixtures
 # =============================================================================
+@pytest.fixture(scope="function", autouse=True)
+def raise_on_pandas_warning():
+    """Raise when pandas gives SettingWithCopyWarning warning"""
+    # Perhaps we should put this in pytest.ini, pyproject.toml, or conftest.py
+    import warnings
+
+    filters = list(warnings.filters)
+    warnings.filterwarnings(
+        "error",
+        category=pd.core.common.SettingWithCopyWarning
+    )
+    yield
+    warnings.filters = filters
+
+
 df_types = [cudf.DataFrame, pd.DataFrame]