Optimize pg.get_x_data APIs (#3086)

This PR closes #3026. The speedup on cupy arrays is `83x`.

Benchmark with `edge_ids` passed as a cupy array:

```
mainline:
----------------------- benchmark: 1 tests -----------------------
Name (time in s, mem in bytes)                       Mean   Rounds
------------------------------------------------------------------
bench_get_vector_features_cp_array[128-1000000]    1.0880        1
------------------------------------------------------------------

branch:
------------------------ benchmark: 1 tests -----------------------
Name (time in ms, mem in bytes)                       Mean   Rounds
-------------------------------------------------------------------
bench_get_vector_features_cp_array[128-1000000]    12.7905        1
-------------------------------------------------------------------
```

Benchmark with `edge_ids` passed as a cudf Series (timing is essentially unchanged):

```
branch:
------------------------- benchmark: 1 tests -------------------------
Name (time in ms, mem in bytes)                          Mean   Rounds
----------------------------------------------------------------------
bench_get_vector_features_cudf_series[128-1000000]    12.7068        1
----------------------------------------------------------------------

mainline:
------------------------- benchmark: 1 tests -------------------------
Name (time in ms, mem in bytes)                          Mean   Rounds
----------------------------------------------------------------------
bench_get_vector_features_cudf_series[128-1000000]    12.6223        1
----------------------------------------------------------------------
```

Authors:
  - Vibhu Jawa (https://github.com/VibhuJawa)

Approvers:
  - Alex Barghi (https://github.com/alexbarghi-nv)

URL: #3086
rapidsai · Jan 3, 2023 · bc2e130 · bc2e130
1 parent b2cdd82
commit bc2e130
Show file tree

Hide file tree

Showing 4 changed files with 114 additions and 26 deletions.
diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py
@@ -13,7 +13,6 @@
 
 import cudf
 import cupy
-import numpy as np
 import cugraph
 import dask_cudf
 import cugraph.dask as dcg
@@ -578,11 +577,13 @@ def get_vertex_data(self, vertex_ids=None, types=None, columns=None):
             if vertex_ids is not None:
                 if isinstance(vertex_ids, int):
                     vertex_ids = [vertex_ids]
-                elif not isinstance(
-                    vertex_ids, (list, slice, np.ndarray, self.__series_type)
-                ):
-                    vertex_ids = list(vertex_ids)
-                df = df.loc[vertex_ids]
+                try:
+                    df = df.loc[vertex_ids]
+                except TypeError:
+                    raise TypeError(
+                        "vertex_ids needs to be a list-like type "
+                        f"compatible with DataFrame.loc[], got {type(vertex_ids)}"
+                    )
 
             if types is not None:
                 if isinstance(types, str):
@@ -906,11 +907,14 @@ def get_edge_data(self, edge_ids=None, types=None, columns=None):
             if edge_ids is not None:
                 if isinstance(edge_ids, int):
                     edge_ids = [edge_ids]
-                elif not isinstance(
-                    edge_ids, (list, slice, np.ndarray, self.__series_type)
-                ):
-                    edge_ids = list(edge_ids)
-                df = df.loc[edge_ids]
+
+                try:
+                    df = df.loc[edge_ids]
+                except TypeError:
+                    raise TypeError(
+                        "edge_ids needs to be a list-like type "
+                        f"compatible with DataFrame.loc[], got {type(edge_ids)}"
+                    )
 
             if types is not None:
                 if isinstance(types, str):

diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py
@@ -512,7 +512,7 @@ def add_vertex_data(
     ):
         """
         Add a dataframe describing vertex properties to the PropertyGraph.
-        Can contain additional vertices that will not have associatede edges.
+        Can contain additional vertices that will not have associated edges.
 
         Parameters
         ----------
@@ -829,11 +829,14 @@ def get_vertex_data(self, vertex_ids=None, types=None, columns=None):
             if vertex_ids is not None:
                 if isinstance(vertex_ids, int):
                     vertex_ids = [vertex_ids]
-                elif not isinstance(
-                    vertex_ids, (list, slice, np.ndarray, self.__series_type)
-                ):
-                    vertex_ids = list(vertex_ids)
-                df = df.loc[vertex_ids]
+
+                try:
+                    df = df.loc[vertex_ids]
+                except TypeError:
+                    raise TypeError(
+                        "vertex_ids needs to be a list-like type "
+                        f"compatible with DataFrame.loc[], got {type(vertex_ids)}"
+                    )
 
             if types is not None:
                 if isinstance(types, str):
@@ -1218,11 +1221,14 @@ def get_edge_data(self, edge_ids=None, types=None, columns=None):
             if edge_ids is not None:
                 if isinstance(edge_ids, int):
                     edge_ids = [edge_ids]
-                elif not isinstance(
-                    edge_ids, (list, slice, np.ndarray, self.__series_type)
-                ):
-                    edge_ids = list(edge_ids)
-                df = df.loc[edge_ids]
+
+                try:
+                    df = df.loc[edge_ids]
+                except TypeError:
+                    raise TypeError(
+                        "edge_ids needs to be a list-like type "
+                        f"compatible with DataFrame.loc[], got {type(edge_ids)}"
+                    )
 
             if types is not None:
                 if isinstance(types, str):

diff --git a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py
@@ -866,11 +866,14 @@ def test_renumber_vertices_by_type(dataset1_MGPropertyGraph, prev_id_column):
         assert df_id_ranges.loc[key, "stop"] == stop
         df = pG.get_vertex_data(types=[key]).compute()
         assert len(df) == stop - start + 1
-        assert (df["_VERTEX_"] == list(range(start, stop + 1))).all()
+        assert (
+            df["_VERTEX_"] == df["_VERTEX_"]._constructor(range(start, stop + 1))
+        ).all()
         if prev_id_column is not None:
             cur = df[prev_id_column].sort_values()
             expected = sorted(x for x, *args in data[key][1])
-            assert (cur == expected).all()
+            expected = cur._constructor(sorted(x for x, *args in data[key][1]))
+            assert (cur.values == expected.values).all()
     # Make sure we renumber vertex IDs in edge data too
     df = pG.get_edge_data().compute()
     assert 0 <= df[pG.src_col_name].min() < df[pG.src_col_name].max() < 9
@@ -905,7 +908,9 @@ def test_renumber_edges_by_type(dataset1_MGPropertyGraph, prev_id_column):
         assert df_id_ranges.loc[key, "stop"] == stop
         df = pG.get_edge_data(types=[key]).compute()
         assert len(df) == stop - start + 1
-        assert (df[pG.edge_id_col_name] == list(range(start, stop + 1))).all()
+        actual = df[pG.edge_id_col_name]
+        expected = actual._constructor(range(start, stop + 1))
+        assert (actual == expected).all()
         if prev_id_column is not None:
             assert prev_id_column in df.columns
 
@@ -1336,3 +1341,32 @@ def func():
         assert len(df) == len(cyber_df)
 
     gpubenchmark(func)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("n_rows", [1_000_000])
+@pytest.mark.parametrize("n_feats", [128])
+def bench_get_vector_features(gpubenchmark, dask_client, n_rows, n_feats):
+    from cugraph.experimental import MGPropertyGraph
+
+    df = cudf.DataFrame(
+        {
+            "src": cp.arange(0, n_rows, dtype=cp.int32),
+            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
+        }
+    )
+    for i in range(n_feats):
+        df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)
+    df = dask_cudf.from_cudf(df, npartitions=16)
+
+    vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
+    pG = MGPropertyGraph()
+    pG.add_edge_data(
+        df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
+    )
+
+    def func(pG):
+        df = pG.get_edge_data(edge_ids=cp.arange(0, 100_000))
+        df = df.compute()
+
+    gpubenchmark(func, pG)
diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py
@@ -2384,7 +2384,7 @@ def func():
     gpubenchmark(func)
 
 
-@pytest.mark.slow
+# @pytest.mark.slow
 @pytest.mark.parametrize("n_rows", [10_000, 100_000, 1_000_000, 10_000_000])
 @pytest.mark.parametrize("n_feats", [32, 64, 128])
 def bench_add_vector_features(gpubenchmark, n_rows, n_feats):
@@ -2408,3 +2408,47 @@ def func():
         )
 
     gpubenchmark(func)
+
+
+@pytest.mark.parametrize("n_rows", [1_000_000])
+@pytest.mark.parametrize("n_feats", [128])
+def bench_get_vector_features_cp_array(benchmark, n_rows, n_feats):
+    from cugraph.experimental import PropertyGraph
+
+    df = cudf.DataFrame(
+        {
+            "src": cp.arange(0, n_rows, dtype=cp.int32),
+            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
+        }
+    )
+    for i in range(n_feats):
+        df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)
+
+    vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
+    pG = PropertyGraph()
+    pG.add_edge_data(
+        df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
+    )
+    benchmark(pG.get_edge_data, edge_ids=cp.arange(0, 100_000))
+
+
+@pytest.mark.parametrize("n_rows", [1_000_000])
+@pytest.mark.parametrize("n_feats", [128])
+def bench_get_vector_features_cudf_series(benchmark, n_rows, n_feats):
+    from cugraph.experimental import PropertyGraph
+
+    df = cudf.DataFrame(
+        {
+            "src": cp.arange(0, n_rows, dtype=cp.int32),
+            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
+        }
+    )
+    for i in range(n_feats):
+        df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)
+
+    vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
+    pG = PropertyGraph()
+    pG.add_edge_data(
+        df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
+    )
+    benchmark(pG.get_edge_data, edge_ids=cudf.Series(cp.arange(0, 100_000)))