Skip to content

Commit

Permalink
Optimize pg.get_x_data APIs (#3086)
Browse files Browse the repository at this point in the history
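This PR closes #3026.
The speedup when the IDs are passed as a cupy array is about `85x` in the benchmark below (1.0880 s on mainline vs. 12.7905 ms on this branch); passing a cudf Series is unchanged (about 12.7 ms on both).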

```
mainline:
----------------------- benchmark: 1 tests -----------------------
Name (time in s, mem in bytes)                        Mean  Rounds
------------------------------------------------------------------
bench_get_vector_features_cp_array[128-1000000]     1.0880       1
------------------------------------------------------------------


branch: 
------------------------ benchmark: 1 tests -----------------------
Name (time in ms, mem in bytes)                        Mean  Rounds
-------------------------------------------------------------------
bench_get_vector_features_cp_array[128-1000000]     12.7905       1
-------------------------------------------------------------------
```

```
branch: 
------------------------- benchmark: 1 tests -------------------------
Name (time in ms, mem in bytes)                           Mean  Rounds
----------------------------------------------------------------------
bench_get_vector_features_cudf_series[128-1000000]     12.7068       1
----------------------------------------------------------------------


Mainline: 
------------------------- benchmark: 1 tests -------------------------
Name (time in ms, mem in bytes)                           Mean  Rounds
----------------------------------------------------------------------
bench_get_vector_features_cudf_series[128-1000000]     12.6223       1
----------------------------------------------------------------------
```

Authors:
  - Vibhu Jawa (https://github.com/VibhuJawa)

Approvers:
  - Alex Barghi (https://github.com/alexbarghi-nv)

URL: #3086
  • Loading branch information
VibhuJawa authored Jan 3, 2023
1 parent b2cdd82 commit bc2e130
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 26 deletions.
26 changes: 15 additions & 11 deletions python/cugraph/cugraph/dask/structure/mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

import cudf
import cupy
import numpy as np
import cugraph
import dask_cudf
import cugraph.dask as dcg
Expand Down Expand Up @@ -578,11 +577,13 @@ def get_vertex_data(self, vertex_ids=None, types=None, columns=None):
if vertex_ids is not None:
if isinstance(vertex_ids, int):
vertex_ids = [vertex_ids]
elif not isinstance(
vertex_ids, (list, slice, np.ndarray, self.__series_type)
):
vertex_ids = list(vertex_ids)
df = df.loc[vertex_ids]
try:
df = df.loc[vertex_ids]
except TypeError:
raise TypeError(
"vertex_ids needs to be a list-like type "
f"compatible with DataFrame.loc[], got {type(vertex_ids)}"
)

if types is not None:
if isinstance(types, str):
Expand Down Expand Up @@ -906,11 +907,14 @@ def get_edge_data(self, edge_ids=None, types=None, columns=None):
if edge_ids is not None:
if isinstance(edge_ids, int):
edge_ids = [edge_ids]
elif not isinstance(
edge_ids, (list, slice, np.ndarray, self.__series_type)
):
edge_ids = list(edge_ids)
df = df.loc[edge_ids]

try:
df = df.loc[edge_ids]
except TypeError:
raise TypeError(
"edge_ids needs to be a list-like type "
f"compatible with DataFrame.loc[], got {type(edge_ids)}"
)

if types is not None:
if isinstance(types, str):
Expand Down
28 changes: 17 additions & 11 deletions python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def add_vertex_data(
):
"""
Add a dataframe describing vertex properties to the PropertyGraph.
Can contain additional vertices that will not have associatede edges.
Can contain additional vertices that will not have associated edges.
Parameters
----------
Expand Down Expand Up @@ -829,11 +829,14 @@ def get_vertex_data(self, vertex_ids=None, types=None, columns=None):
if vertex_ids is not None:
if isinstance(vertex_ids, int):
vertex_ids = [vertex_ids]
elif not isinstance(
vertex_ids, (list, slice, np.ndarray, self.__series_type)
):
vertex_ids = list(vertex_ids)
df = df.loc[vertex_ids]

try:
df = df.loc[vertex_ids]
except TypeError:
raise TypeError(
"vertex_ids needs to be a list-like type "
f"compatible with DataFrame.loc[], got {type(vertex_ids)}"
)

if types is not None:
if isinstance(types, str):
Expand Down Expand Up @@ -1218,11 +1221,14 @@ def get_edge_data(self, edge_ids=None, types=None, columns=None):
if edge_ids is not None:
if isinstance(edge_ids, int):
edge_ids = [edge_ids]
elif not isinstance(
edge_ids, (list, slice, np.ndarray, self.__series_type)
):
edge_ids = list(edge_ids)
df = df.loc[edge_ids]

try:
df = df.loc[edge_ids]
except TypeError:
raise TypeError(
"edge_ids needs to be a list-like type "
f"compatible with DataFrame.loc[], got {type(edge_ids)}"
)

if types is not None:
if isinstance(types, str):
Expand Down
40 changes: 37 additions & 3 deletions python/cugraph/cugraph/tests/mg/test_mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -866,11 +866,14 @@ def test_renumber_vertices_by_type(dataset1_MGPropertyGraph, prev_id_column):
assert df_id_ranges.loc[key, "stop"] == stop
df = pG.get_vertex_data(types=[key]).compute()
assert len(df) == stop - start + 1
assert (df["_VERTEX_"] == list(range(start, stop + 1))).all()
assert (
df["_VERTEX_"] == df["_VERTEX_"]._constructor(range(start, stop + 1))
).all()
if prev_id_column is not None:
cur = df[prev_id_column].sort_values()
expected = sorted(x for x, *args in data[key][1])
assert (cur == expected).all()
expected = cur._constructor(sorted(x for x, *args in data[key][1]))
assert (cur.values == expected.values).all()
# Make sure we renumber vertex IDs in edge data too
df = pG.get_edge_data().compute()
assert 0 <= df[pG.src_col_name].min() < df[pG.src_col_name].max() < 9
Expand Down Expand Up @@ -905,7 +908,9 @@ def test_renumber_edges_by_type(dataset1_MGPropertyGraph, prev_id_column):
assert df_id_ranges.loc[key, "stop"] == stop
df = pG.get_edge_data(types=[key]).compute()
assert len(df) == stop - start + 1
assert (df[pG.edge_id_col_name] == list(range(start, stop + 1))).all()
actual = df[pG.edge_id_col_name]
expected = actual._constructor(range(start, stop + 1))
assert (actual == expected).all()
if prev_id_column is not None:
assert prev_id_column in df.columns

Expand Down Expand Up @@ -1336,3 +1341,32 @@ def func():
assert len(df) == len(cyber_df)

gpubenchmark(func)


@pytest.mark.slow
@pytest.mark.parametrize("n_rows", [1_000_000])
@pytest.mark.parametrize("n_feats", [128])
def bench_get_vector_features(gpubenchmark, dask_client, n_rows, n_feats):
    """Benchmark ``MGPropertyGraph.get_edge_data`` with cupy-array edge IDs.

    An edge list with ``n_rows`` rows and ``n_feats`` int32 feature columns
    (grouped into a single vector property named ``"feat"``) is loaded into
    an ``MGPropertyGraph``. The timed function then requests the edges with
    IDs ``0 .. 99_999`` and calls ``.compute()`` on the result.
    """
    from cugraph.experimental import MGPropertyGraph

    feat_names = [f"feat_{i}" for i in range(n_feats)]

    edge_df = cudf.DataFrame(
        {
            "src": cp.arange(0, n_rows, dtype=cp.int32),
            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
        }
    )
    for name in feat_names:
        edge_df[name] = cp.ones(len(edge_df), dtype=cp.int32)
    edge_ddf = dask_cudf.from_cudf(edge_df, npartitions=16)

    graph = MGPropertyGraph()
    graph.add_edge_data(
        edge_ddf,
        vertex_col_names=["src", "dst"],
        vector_properties={"feat": feat_names},
    )

    def fetch_edges(pg):
        # The ID array is built and the result is computed inside the timed
        # callable, so both are part of the measurement.
        pg.get_edge_data(edge_ids=cp.arange(0, 100_000)).compute()

    gpubenchmark(fetch_edges, graph)
46 changes: 45 additions & 1 deletion python/cugraph/cugraph/tests/test_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -2384,7 +2384,7 @@ def func():
gpubenchmark(func)


@pytest.mark.slow
# @pytest.mark.slow
@pytest.mark.parametrize("n_rows", [10_000, 100_000, 1_000_000, 10_000_000])
@pytest.mark.parametrize("n_feats", [32, 64, 128])
def bench_add_vector_features(gpubenchmark, n_rows, n_feats):
Expand All @@ -2408,3 +2408,47 @@ def func():
)

gpubenchmark(func)


@pytest.mark.parametrize("n_rows", [1_000_000])
@pytest.mark.parametrize("n_feats", [128])
def bench_get_vector_features_cp_array(benchmark, n_rows, n_feats):
    """Benchmark ``PropertyGraph.get_edge_data`` with cupy-array edge IDs.

    An edge list with ``n_rows`` rows and ``n_feats`` int32 feature columns
    (grouped into a single vector property named ``"feat"``) is loaded into
    a ``PropertyGraph``; the benchmark then times a lookup of the edges with
    IDs ``0 .. 99_999`` given as a cupy array.
    """
    from cugraph.experimental import PropertyGraph

    feat_names = [f"feat_{i}" for i in range(n_feats)]

    edge_df = cudf.DataFrame(
        {
            "src": cp.arange(0, n_rows, dtype=cp.int32),
            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
        }
    )
    for name in feat_names:
        edge_df[name] = cp.ones(len(edge_df), dtype=cp.int32)

    graph = PropertyGraph()
    graph.add_edge_data(
        edge_df,
        vertex_col_names=["src", "dst"],
        vector_properties={"feat": feat_names},
    )

    # The ID array is created once, outside the timed call.
    wanted_ids = cp.arange(0, 100_000)
    benchmark(graph.get_edge_data, edge_ids=wanted_ids)


@pytest.mark.parametrize("n_rows", [1_000_000])
@pytest.mark.parametrize("n_feats", [128])
def bench_get_vector_features_cudf_series(benchmark, n_rows, n_feats):
    """Benchmark ``PropertyGraph.get_edge_data`` with cudf-Series edge IDs.

    An edge list with ``n_rows`` rows and ``n_feats`` int32 feature columns
    (grouped into a single vector property named ``"feat"``) is loaded into
    a ``PropertyGraph``; the benchmark then times a lookup of the edges with
    IDs ``0 .. 99_999`` given as a ``cudf.Series``.
    """
    from cugraph.experimental import PropertyGraph

    feat_names = [f"feat_{i}" for i in range(n_feats)]

    edge_df = cudf.DataFrame(
        {
            "src": cp.arange(0, n_rows, dtype=cp.int32),
            "dst": cp.arange(0, n_rows, dtype=cp.int32) + 1,
        }
    )
    for name in feat_names:
        edge_df[name] = cp.ones(len(edge_df), dtype=cp.int32)

    graph = PropertyGraph()
    graph.add_edge_data(
        edge_df,
        vertex_col_names=["src", "dst"],
        vector_properties={"feat": feat_names},
    )

    # The ID Series is created once, outside the timed call.
    wanted_ids = cudf.Series(cp.arange(0, 100_000))
    benchmark(graph.get_edge_data, edge_ids=wanted_ids)

0 comments on commit bc2e130

Please sign in to comment.