Let DataFrame.loc[] validate vertex_ids/edge_ids in (MG)PropertyGraph lookups.

get_vertex_data() and get_edge_data() no longer convert unrecognised id
containers with list(); the ids are handed to DataFrame.loc[] as given, and a
TypeError raised by .loc[] is re-raised (chained to the original) with a
message naming the offending type.  The MG tests build their expected values
with the column's own _constructor, and benchmarks for get_edge_data() with
cupy-array and cudf-Series ids are added.

The post-image blob ids on the "index" lines were not regenerated.

diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py
index 2a986a07623..e7ee8350ed7 100644
--- a/python/cugraph/cugraph/dask/structure/mg_property_graph.py
+++ b/python/cugraph/cugraph/dask/structure/mg_property_graph.py
@@ -13,7 +13,6 @@
 
 import cudf
 import cupy
-import numpy as np
 import cugraph
 import dask_cudf
 import cugraph.dask as dcg
@@ -578,11 +577,13 @@ def get_vertex_data(self, vertex_ids=None, types=None, columns=None):
         if vertex_ids is not None:
             if isinstance(vertex_ids, int):
                 vertex_ids = [vertex_ids]
-            elif not isinstance(
-                vertex_ids, (list, slice, np.ndarray, self.__series_type)
-            ):
-                vertex_ids = list(vertex_ids)
-            df = df.loc[vertex_ids]
+            try:
+                df = df.loc[vertex_ids]
+            except TypeError as err:
+                raise TypeError(
+                    "vertex_ids needs to be a list-like type "
+                    f"compatible with DataFrame.loc[], got {type(vertex_ids)}"
+                ) from err
 
         if types is not None:
             if isinstance(types, str):
@@ -906,11 +907,14 @@ def get_edge_data(self, edge_ids=None, types=None, columns=None):
         if edge_ids is not None:
             if isinstance(edge_ids, int):
                 edge_ids = [edge_ids]
-            elif not isinstance(
-                edge_ids, (list, slice, np.ndarray, self.__series_type)
-            ):
-                edge_ids = list(edge_ids)
-            df = df.loc[edge_ids]
+
+            try:
+                df = df.loc[edge_ids]
+            except TypeError as err:
+                raise TypeError(
+                    "edge_ids needs to be a list-like type "
+                    f"compatible with DataFrame.loc[], got {type(edge_ids)}"
+                ) from err
 
         if types is not None:
             if isinstance(types, str):
diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py
index e203e01e650..45ff7452b21 100644
--- a/python/cugraph/cugraph/structure/property_graph.py
+++ b/python/cugraph/cugraph/structure/property_graph.py
@@ -512,7 +512,7 @@ def add_vertex_data(
     ):
         """
         Add a dataframe describing vertex properties to the PropertyGraph.
-        Can contain additional vertices that will not have associatede edges.
+        Can contain additional vertices that will not have associated edges.
 
         Parameters
         ----------
@@ -829,11 +829,14 @@ def get_vertex_data(self, vertex_ids=None, types=None, columns=None):
         if vertex_ids is not None:
             if isinstance(vertex_ids, int):
                 vertex_ids = [vertex_ids]
-            elif not isinstance(
-                vertex_ids, (list, slice, np.ndarray, self.__series_type)
-            ):
-                vertex_ids = list(vertex_ids)
-            df = df.loc[vertex_ids]
+
+            try:
+                df = df.loc[vertex_ids]
+            except TypeError as err:
+                raise TypeError(
+                    "vertex_ids needs to be a list-like type "
+                    f"compatible with DataFrame.loc[], got {type(vertex_ids)}"
+                ) from err
 
         if types is not None:
             if isinstance(types, str):
@@ -1218,11 +1221,14 @@ def get_edge_data(self, edge_ids=None, types=None, columns=None):
         if edge_ids is not None:
             if isinstance(edge_ids, int):
                 edge_ids = [edge_ids]
-            elif not isinstance(
-                edge_ids, (list, slice, np.ndarray, self.__series_type)
-            ):
-                edge_ids = list(edge_ids)
-            df = df.loc[edge_ids]
+
+            try:
+                df = df.loc[edge_ids]
+            except TypeError as err:
+                raise TypeError(
+                    "edge_ids needs to be a list-like type "
+                    f"compatible with DataFrame.loc[], got {type(edge_ids)}"
+                ) from err
 
         if types is not None:
             if isinstance(types, str):
diff --git a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py
index 1879d99ecca..e0db183be80 100644
--- a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py
+++ b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py
@@ -866,11 +866,13 @@ def test_renumber_vertices_by_type(dataset1_MGPropertyGraph, prev_id_column):
         assert df_id_ranges.loc[key, "stop"] == stop
         df = pG.get_vertex_data(types=[key]).compute()
         assert len(df) == stop - start + 1
-        assert (df["_VERTEX_"] == list(range(start, stop + 1))).all()
+        assert (
+            df["_VERTEX_"] == df["_VERTEX_"]._constructor(range(start, stop + 1))
+        ).all()
         if prev_id_column is not None:
             cur = df[prev_id_column].sort_values()
-            expected = sorted(x for x, *args in data[key][1])
-            assert (cur == expected).all()
+            expected = cur._constructor(sorted(x for x, *args in data[key][1]))
+            assert (cur.values == expected.values).all()
     # Make sure we renumber vertex IDs in edge data too
     df = pG.get_edge_data().compute()
     assert 0 <= df[pG.src_col_name].min() < df[pG.src_col_name].max() < 9
@@ -905,7 +907,9 @@ def test_renumber_edges_by_type(dataset1_MGPropertyGraph, prev_id_column):
         assert df_id_ranges.loc[key, "stop"] == stop
         df = pG.get_edge_data(types=[key]).compute()
         assert len(df) == stop - start + 1
-        assert (df[pG.edge_id_col_name] == list(range(start, stop + 1))).all()
+        actual = df[pG.edge_id_col_name]
+        expected = actual._constructor(range(start, stop + 1))
+        assert (actual == expected).all()
         if prev_id_column is not None:
             assert prev_id_column in df.columns
 
@@ -1336,3 +1340,32 @@ def func():
         assert len(df) == len(cyber_df)
 
     gpubenchmark(func)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("n_rows", [1_000_000])
+@pytest.mark.parametrize("n_feats", [128])
+def bench_get_vector_features(gpubenchmark, dask_client, n_rows, n_feats):
+    from cugraph.experimental import MGPropertyGraph
+
+    df = cudf.DataFrame(
+        {
+            "src": cp.arange(0, n_rows, dtype=cp.int32),
+            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
+        }
+    )
+    for i in range(n_feats):
+        df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)
+    df = dask_cudf.from_cudf(df, npartitions=16)
+
+    vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
+    pG = MGPropertyGraph()
+    pG.add_edge_data(
+        df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
+    )
+
+    def func(pG):
+        df = pG.get_edge_data(edge_ids=cp.arange(0, 100_000))
+        df = df.compute()
+
+    gpubenchmark(func, pG)
diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py
index ff67fafcbfc..b066772ff08 100644
--- a/python/cugraph/cugraph/tests/test_property_graph.py
+++ b/python/cugraph/cugraph/tests/test_property_graph.py
@@ -2408,3 +2408,47 @@ def func():
         )
 
     gpubenchmark(func)
+
+
+@pytest.mark.parametrize("n_rows", [1_000_000])
+@pytest.mark.parametrize("n_feats", [128])
+def bench_get_vector_features_cp_array(benchmark, n_rows, n_feats):
+    from cugraph.experimental import PropertyGraph
+
+    df = cudf.DataFrame(
+        {
+            "src": cp.arange(0, n_rows, dtype=cp.int32),
+            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
+        }
+    )
+    for i in range(n_feats):
+        df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)
+
+    vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
+    pG = PropertyGraph()
+    pG.add_edge_data(
+        df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
+    )
+    benchmark(pG.get_edge_data, edge_ids=cp.arange(0, 100_000))
+
+
+@pytest.mark.parametrize("n_rows", [1_000_000])
+@pytest.mark.parametrize("n_feats", [128])
+def bench_get_vector_features_cudf_series(benchmark, n_rows, n_feats):
+    from cugraph.experimental import PropertyGraph
+
+    df = cudf.DataFrame(
+        {
+            "src": cp.arange(0, n_rows, dtype=cp.int32),
+            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
+        }
+    )
+    for i in range(n_feats):
+        df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)
+
+    vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
+    pG = PropertyGraph()
+    pG.add_edge_data(
+        df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
+    )
+    benchmark(pG.get_edge_data, edge_ids=cudf.Series(cp.arange(0, 100_000)))