Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Optimize pg.get_x_data APIs #3086

Merged
26 changes: 15 additions & 11 deletions python/cugraph/cugraph/dask/structure/mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

import cudf
import cupy
import numpy as np
import cugraph
import dask_cudf
import cugraph.dask as dcg
Expand Down Expand Up @@ -578,11 +577,13 @@ def get_vertex_data(self, vertex_ids=None, types=None, columns=None):
if vertex_ids is not None:
if isinstance(vertex_ids, int):
vertex_ids = [vertex_ids]
elif not isinstance(
vertex_ids, (list, slice, np.ndarray, self.__series_type)
):
vertex_ids = list(vertex_ids)
VibhuJawa marked this conversation as resolved.
Show resolved Hide resolved
df = df.loc[vertex_ids]
try:
df = df.loc[vertex_ids]
except TypeError:
raise TypeError(
"vertex_ids needs to be a list-like type "
f"compatible with DataFrame.loc[], got {type(vertex_ids)}"
)

if types is not None:
if isinstance(types, str):
Expand Down Expand Up @@ -906,11 +907,14 @@ def get_edge_data(self, edge_ids=None, types=None, columns=None):
if edge_ids is not None:
if isinstance(edge_ids, int):
edge_ids = [edge_ids]
elif not isinstance(
edge_ids, (list, slice, np.ndarray, self.__series_type)
):
edge_ids = list(edge_ids)
df = df.loc[edge_ids]

try:
df = df.loc[edge_ids]
except TypeError:
raise TypeError(
"edge_ids needs to be a list-like type "
f"compatible with DataFrame.loc[], got {type(edge_ids)}"
)

if types is not None:
if isinstance(types, str):
Expand Down
28 changes: 17 additions & 11 deletions python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def add_vertex_data(
):
"""
Add a dataframe describing vertex properties to the PropertyGraph.
Can contain additional vertices that will not have associatede edges.
Can contain additional vertices that will not have associated edges.

Parameters
----------
Expand Down Expand Up @@ -829,11 +829,14 @@ def get_vertex_data(self, vertex_ids=None, types=None, columns=None):
if vertex_ids is not None:
if isinstance(vertex_ids, int):
vertex_ids = [vertex_ids]
elif not isinstance(
vertex_ids, (list, slice, np.ndarray, self.__series_type)
):
vertex_ids = list(vertex_ids)
df = df.loc[vertex_ids]

try:
df = df.loc[vertex_ids]
except TypeError:
raise TypeError(
"vertex_ids needs to be a list-like type "
f"compatible with DataFrame.loc[], got {type(vertex_ids)}"
)

if types is not None:
if isinstance(types, str):
Expand Down Expand Up @@ -1218,11 +1221,14 @@ def get_edge_data(self, edge_ids=None, types=None, columns=None):
if edge_ids is not None:
if isinstance(edge_ids, int):
edge_ids = [edge_ids]
elif not isinstance(
edge_ids, (list, slice, np.ndarray, self.__series_type)
):
edge_ids = list(edge_ids)
df = df.loc[edge_ids]

try:
df = df.loc[edge_ids]
except TypeError:
raise TypeError(
"edge_ids needs to be a list-like type "
f"compatible with DataFrame.loc[], got {type(edge_ids)}"
)

if types is not None:
if isinstance(types, str):
Expand Down
40 changes: 37 additions & 3 deletions python/cugraph/cugraph/tests/mg/test_mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -866,11 +866,14 @@ def test_renumber_vertices_by_type(dataset1_MGPropertyGraph, prev_id_column):
assert df_id_ranges.loc[key, "stop"] == stop
df = pG.get_vertex_data(types=[key]).compute()
assert len(df) == stop - start + 1
assert (df["_VERTEX_"] == list(range(start, stop + 1))).all()
assert (
df["_VERTEX_"] == df["_VERTEX_"]._constructor(range(start, stop + 1))
).all()
Comment on lines +869 to +871
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes are similar to https://github.com/rapidsai/cugraph/pull/3036/files#r1048542911 .

Unsure how CI passed without them

if prev_id_column is not None:
cur = df[prev_id_column].sort_values()
expected = sorted(x for x, *args in data[key][1])
assert (cur == expected).all()
expected = cur._constructor(sorted(x for x, *args in data[key][1]))
assert (cur.values == expected.values).all()
Comment on lines +875 to +876
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made similar changes in my PR. We have to be more careful running MG tests until they are included in CI.

# Make sure we renumber vertex IDs in edge data too
df = pG.get_edge_data().compute()
assert 0 <= df[pG.src_col_name].min() < df[pG.src_col_name].max() < 9
Expand Down Expand Up @@ -905,7 +908,9 @@ def test_renumber_edges_by_type(dataset1_MGPropertyGraph, prev_id_column):
assert df_id_ranges.loc[key, "stop"] == stop
df = pG.get_edge_data(types=[key]).compute()
assert len(df) == stop - start + 1
assert (df[pG.edge_id_col_name] == list(range(start, stop + 1))).all()
actual = df[pG.edge_id_col_name]
expected = actual._constructor(range(start, stop + 1))
assert (actual == expected).all()
Comment on lines +911 to +913
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if prev_id_column is not None:
assert prev_id_column in df.columns

Expand Down Expand Up @@ -1336,3 +1341,32 @@ def func():
assert len(df) == len(cyber_df)

gpubenchmark(func)


@pytest.mark.slow
@pytest.mark.parametrize("n_rows", [1_000_000])
@pytest.mark.parametrize("n_feats", [128])
def bench_get_vector_features(gpubenchmark, dask_client, n_rows, n_feats):
from cugraph.experimental import MGPropertyGraph

df = cudf.DataFrame(
{
"src": cp.arange(0, n_rows, dtype=cp.int32),
"dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
}
)
for i in range(n_feats):
df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)
df = dask_cudf.from_cudf(df, npartitions=16)

vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
pG = MGPropertyGraph()
pG.add_edge_data(
df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
)

def func(pG):
df = pG.get_edge_data(edge_ids=cp.arange(0, 100_000))
df = df.compute()

gpubenchmark(func, pG)
46 changes: 45 additions & 1 deletion python/cugraph/cugraph/tests/test_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -2384,7 +2384,7 @@ def func():
gpubenchmark(func)


@pytest.mark.slow
# @pytest.mark.slow
@pytest.mark.parametrize("n_rows", [10_000, 100_000, 1_000_000, 10_000_000])
@pytest.mark.parametrize("n_feats", [32, 64, 128])
def bench_add_vector_features(gpubenchmark, n_rows, n_feats):
Expand All @@ -2408,3 +2408,47 @@ def func():
)

gpubenchmark(func)


@pytest.mark.parametrize("n_rows", [1_000_000])
@pytest.mark.parametrize("n_feats", [128])
def bench_get_vector_features_cp_array(benchmark, n_rows, n_feats):
from cugraph.experimental import PropertyGraph

df = cudf.DataFrame(
{
"src": cp.arange(0, n_rows, dtype=cp.int32),
"dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
}
)
for i in range(n_feats):
df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)

vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
pG = PropertyGraph()
pG.add_edge_data(
df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
)
benchmark(pG.get_edge_data, edge_ids=cp.arange(0, 100_000))


@pytest.mark.parametrize("n_rows", [1_000_000])
@pytest.mark.parametrize("n_feats", [128])
def bench_get_vector_features_cudf_series(benchmark, n_rows, n_feats):
from cugraph.experimental import PropertyGraph

df = cudf.DataFrame(
{
"src": cp.arange(0, n_rows, dtype=cp.int32),
"dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
}
)
for i in range(n_feats):
df[f"feat_{i}"] = cp.ones(len(df), dtype=cp.int32)

vector_properties = {"feat": [f"feat_{i}" for i in range(n_feats)]}
pG = PropertyGraph()
pG.add_edge_data(
df, vertex_col_names=["src", "dst"], vector_properties=vector_properties
)
benchmark(pG.get_edge_data, edge_ids=cudf.Series(cp.arange(0, 100_000)))