Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add node2vec wrapper to cugraph #2093

Merged
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a8c6634
Feature from branch-22.04-node2vec without merging issues
betochimas Feb 18, 2022
70bccbd
Merge pull request #11 from betochimas/branch-22.04-fea-node2vec-pyli…
betochimas Feb 18, 2022
226a2bc
Initial commit to cugraph node2vec wrapper
betochimas Feb 18, 2022
9399259
Account for offsets to path_sizes change
betochimas Feb 18, 2022
010f87a
Improved testing coverage
betochimas Feb 18, 2022
6b0b017
Merge pull request #12 from betochimas/branch-22.04-fea-node2vec-pyli…
betochimas Feb 18, 2022
abe6eed
Description update
betochimas Feb 18, 2022
7184312
Testing update for both values of compress_result, will pass once #20…
betochimas Feb 22, 2022
0495fc8
Style edits
betochimas Feb 22, 2022
97f90b4
Testing based on random_walks suite
betochimas Feb 23, 2022
0b1886a
Testing more inline with C implementation
betochimas Feb 23, 2022
16ecf4a
Merge pull request #13 from betochimas/branch-22.04-fea-node2vec-pyli…
betochimas Feb 23, 2022
f7cc0bb
Implementation ready, testing outline for cugraph node2vec
betochimas Feb 23, 2022
e1a595c
Implementation plus testing, with exception on networkx graphs
betochimas Feb 23, 2022
8d6aad4
Updated docstring
betochimas Feb 23, 2022
53d309a
Merge pull request #15 from rapidsai/branch-22.04
betochimas Feb 24, 2022
465c6b3
Merge pull request #16 from rapidsai/branch-22.04
betochimas Feb 24, 2022
494bdba
Merge pull request #17 from betochimas/branch-22.04
betochimas Feb 25, 2022
a640a54
Removed slower type check and redundant cupy array cast
betochimas Mar 1, 2022
e035788
Replaced source_array with seed_array
betochimas Mar 2, 2022
19b06ce
Merge pull request #19 from betochimas/branch-22.04-fea-node2vec-pyli…
betochimas Mar 2, 2022
44864e8
Resolving part of PR review, mainly description and checks
betochimas Mar 3, 2022
363ebbd
Merge pull request #20 from betochimas/branch-22.04
betochimas Mar 7, 2022
b06ccda
Merge pull request #21 from betochimas/branch-22.04-fea-node2vec-pyli…
betochimas Mar 7, 2022
621dc95
Testing now checks individual walks are valid
betochimas Mar 8, 2022
154816c
Merge pull request #22 from betochimas/branch-22.04
betochimas Mar 9, 2022
bf1e221
More efficient node2vec testing, other review changes
betochimas Mar 10, 2022
9100f77
CI checks + edits
betochimas Mar 10, 2022
308984c
Made threshold vals for force atlas 2 consistent over cpp and python …
betochimas Mar 10, 2022
dacbba8
Type edit in test_node2vec
betochimas Mar 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/cugraph/cugraph/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@
from cugraph.raft import raft_include_test
from cugraph.comms import comms

from cugraph.sampling import random_walks, rw_path
from cugraph.sampling import random_walks, rw_path, node2vec

from cugraph import experimental

Expand Down
3 changes: 2 additions & 1 deletion python/cugraph/cugraph/sampling/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -12,3 +12,4 @@
# limitations under the License.

from cugraph.sampling.random_walks import random_walks, rw_path
from cugraph.sampling.node2vec import node2vec
140 changes: 140 additions & 0 deletions python/cugraph/cugraph/sampling/node2vec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pylibcugraph
import cudf
from cugraph.utilities import ensure_cugraph_obj_for_nx


def node2vec(G, start_vertices, max_depth, use_padding, p=1.0, q=1.0):
    """
    Computes random walks for each node in 'start_vertices', under the
    node2vec sampling framework.

    References
    ----------

    A Grover, J Leskovec: node2vec: Scalable Feature Learning for Networks,
    Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge
    Discovery and Data Mining, https://arxiv.org/abs/1607.00653

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph can be either directed (DiGraph) or undirected (Graph).

    start_vertices: int or list or cudf.Series
        A single node or a list or a cudf.Series of nodes from which to run
        the random walks

    max_depth: int
        The maximum depth of the random walks. Required; must be a positive
        integer.

    use_padding: bool
        Required. If True, padded paths are returned else coalesced paths
        are returned

    p: double, optional (default=1.0)
        Return factor, which represents the likelihood of backtracking to
        a previous node in the walk. A higher value makes it less likely to
        sample a previously visited node, while a lower value makes it more
        likely to backtrack, making the walk "local". Must be positive.

    q: double, optional (default=1.0)
        In-out factor, which represents the likelihood of visiting nodes
        closer or further from the outgoing node. If q > 1, the random walk
        is likelier to visit nodes closer to the outgoing node. If q < 1, the
        random walk is likelier to visit nodes further from the outgoing node.
        Must be positive.

    Returns
    -------
    vertex_paths : cudf.Series
        Series containing the vertices of edges/paths in the random walk.

    edge_weight_paths: cudf.Series
        Series containing the edge weights of edges represented by the
        returned vertex_paths

    sizes: cudf.Series
        Series built from the path sizes returned by pylibcugraph.

    Raises
    ------
    ValueError
        If 'max_depth' is not a positive int, 'use_padding' is not a bool,
        or 'p' / 'q' is None or not strictly positive.

    Example
    -------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
    >>> start_vertices = cudf.Series([0, 2])
    >>> paths, weights, path_sizes = cugraph.node2vec(G, start_vertices, 3,
    ...                                               True, 0.8, 0.5)

    """
    # Validate arguments before touching the graph so bad input fails fast.
    if (not isinstance(max_depth, int)) or (max_depth < 1):
        raise ValueError("'max_depth' must be a positive integer")
    if not isinstance(use_padding, bool):
        raise ValueError("'use_padding' must be a bool")
    if (p is None) or (p <= 0.0):
        raise ValueError("'p' must be a positive double")
    if (q is None) or (q <= 0.0):
        raise ValueError("'q' must be a positive double")

    G, _ = ensure_cugraph_obj_for_nx(G)

    # Normalize the seeds: scalar -> list -> cudf.Series.
    if isinstance(start_vertices, int):
        start_vertices = [start_vertices]

    if isinstance(start_vertices, list):
        start_vertices = cudf.Series(start_vertices)

    # Translate user-facing vertex ids to the graph's internal ids.
    if G.renumbered is True:
        if isinstance(start_vertices, cudf.DataFrame):
            start_vertices = G.lookup_internal_vertex_id(
                start_vertices, start_vertices.columns)
        else:
            start_vertices = G.lookup_internal_vertex_id(start_vertices)

    # NOTE(review): a reviewer flagged that these columns come from the
    # renumbered edgelist and that this access may raise for a Graph that was
    # not built from a renumbered dataframe - TODO confirm and guard.
    # FIXME: per the PR discussion, the pylibcugraph wrapper reportedly does
    # not behave as intended when G.renumbered is False; this is believed to
    # need a fix in the pylibcugraph or C layer - verify.
    edgelist_df = G.edgelist.edgelist_df
    srcs = edgelist_df['src']
    dsts = edgelist_df['dst']
    weights = edgelist_df['weights']

    resource_handle = pylibcugraph.experimental.ResourceHandle()
    graph_props = pylibcugraph.experimental.GraphProperties(
        is_multigraph=G.is_multigraph())
    store_transposed = False
    renumber = G.renumbered
    do_expensive_check = False

    sg = pylibcugraph.experimental.SGGraph(resource_handle, graph_props,
                                           srcs, dsts, weights,
                                           store_transposed, renumber,
                                           do_expensive_check)

    vertex_set, edge_set, sizes = pylibcugraph.experimental.node2vec(
                                    resource_handle, sg, start_vertices,
                                    max_depth, use_padding, p, q)
    vertex_set = cudf.Series(vertex_set)
    edge_set = cudf.Series(edge_set)
    sizes = cudf.Series(sizes)

    # Map internal vertex ids back to the caller's ids, keeping walk order.
    if G.renumbered:
        df_ = cudf.DataFrame()
        df_['vertex_set'] = vertex_set
        df_ = G.unrenumber(df_, 'vertex_set', preserve_order=True)
        vertex_set = cudf.Series(df_['vertex_set'])

    if use_padding:
        # Padded output: every walk contributes (max_depth - 1) edge slots.
        edge_set_sz = (max_depth - 1) * len(start_vertices)
        return vertex_set, edge_set[:edge_set_sz], sizes

    # Coalesced output: the number of valid vertex entries is the total of
    # the per-walk path sizes. (Summing 'vertex_set' itself would add up
    # vertex ids, which is not a length.) Each walk has one fewer edge than
    # it has vertices.
    vertex_set_sz = int(sizes.sum())
    edge_set_sz = vertex_set_sz - len(start_vertices)
    return vertex_set[:vertex_set_sz], edge_set[:edge_set_sz], sizes
2 changes: 1 addition & 1 deletion python/cugraph/cugraph/sampling/random_walks.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def random_walks(G,
>>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
... dtype=['int32', 'int32', 'float32'], header=None)
>>> G = cugraph.Graph()
>>> G.from_cudf_edgelist(M, source='0', destination='1')
>>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
>>> _, _, _ = cugraph.random_walks(G, M, 3)

"""
Expand Down
177 changes: 177 additions & 0 deletions python/cugraph/cugraph/tests/test_node2vec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random

import pytest

from cugraph.tests import utils
import cugraph


# =============================================================================
# Parameters
# =============================================================================
# Graph directedness values the tests are parametrized over.
DIRECTED_GRAPH_OPTIONS = [False, True]
# utils.DATASETS_SMALL with every entry wrapped in pytest.param. The tests
# visible in this file parametrize on utils.DATASETS_SMALL directly rather
# than on this list.
DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL]


# =============================================================================
# Pytest Setup / Teardown - called for each test function
# =============================================================================
def setup_function():
    """Pytest setup hook: run a garbage-collection pass before each test."""
    gc.collect()


def calc_node2vec(G,
                  start_vertices,
                  max_depth=None,
                  use_padding=False,
                  p=1.0,
                  q=1.0):
    """
    Run cugraph.node2vec for every node in 'start_vertices'.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        Graph to sample from; must not be None.

    start_vertices : int or list or cudf.Series

    max_depth : int

    use_padding : bool

    p : double

    q : double

    Returns
    -------
    tuple
        ((vertex_paths, edge_weights, vertex_path_sizes), start_vertices),
        where the inner triple is what cugraph.node2vec returned and
        start_vertices is passed back unchanged.
    """
    assert G is not None

    # All arguments are forwarded positionally, untouched.
    walk_result = cugraph.node2vec(
        G, start_vertices, max_depth, use_padding, p, q)
    paths, weights, path_sizes = walk_result

    return (paths, weights, path_sizes), start_vertices


@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
def test_node2vec_coalesced(graph_file, directed):
    """
    Run node2vec with use_padding=False from a random set of seeds and
    check the number of vertices returned.
    """
    G = utils.generate_cugraph_graph_from_file(
        graph_file, directed=directed, edgevals=True)

    # Between 1 and 10 distinct seed vertices, drawn from the vertex range.
    num_seeds = random.randint(1, 10)
    seed_vertices = random.sample(range(G.number_of_vertices()), num_seeds)
    depth = 3

    (vertex_paths, edge_weights, vertex_path_sizes), _ = calc_node2vec(
        G, seed_vertices, depth, use_padding=False, p=0.8, q=0.5)

    # Only the vertex count is verified; weights are not yet checked
    # against the paths.
    assert vertex_paths.size == depth * num_seeds
    # NOTE: This below assertion will pass once PR #2089 is merged
    # assert edge_weights.size == (depth - 1) * num_seeds


@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
def test_node2vec_padded(graph_file, directed):
    """
    Run node2vec with use_padding=True from a random set of seeds and check
    the vertex count and that the path sizes add up to it.
    """
    G = utils.generate_cugraph_graph_from_file(
        graph_file, directed=directed, edgevals=True)

    # Between 1 and 10 distinct seed vertices, drawn from the vertex range.
    num_seeds = random.randint(1, 10)
    seed_vertices = random.sample(range(G.number_of_vertices()), num_seeds)
    depth = 3

    (vertex_paths, edge_weights, vertex_path_sizes), _ = calc_node2vec(
        G, seed_vertices, depth, use_padding=True, p=0.8, q=0.5)

    assert vertex_paths.size == depth * num_seeds
    # NOTE: This below assertion will pass once PR #2089 is merged
    # assert edge_weights.size == (depth - 1) * num_seeds
    assert vertex_path_sizes.sum() == vertex_paths.size
    # TODO: check that weights match up with paths
    # TODO: check that path sizes match up correctly with paths


@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
@pytest.mark.parametrize("max_depth, p", [
    (None, 0.8),   # max_depth missing
    (-1, 0.8),     # max_depth not positive
    (3, None),     # p missing
    (3, -1),       # p not positive
])
def test_node2vec_invalid(
    graph_file,
    directed,
    max_depth,
    p
):
    """
    node2vec must raise ValueError when max_depth or p is invalid.

    Each case makes exactly one of the two arguments invalid. With a full
    cross product of invalid values, max_depth would always be rejected
    first and the check on p would never be reached.
    """
    G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed,
                                               edgevals=True)
    k = random.randint(1, 10)
    start_vertices = random.sample(range(G.number_of_vertices()), k)
    # use_padding and q are held at valid values so that only the
    # parametrized argument can trigger the error.
    use_padding = True
    q = 1.0
    with pytest.raises(ValueError):
        calc_node2vec(
            G,
            start_vertices,
            max_depth=max_depth,
            use_padding=use_padding,
            p=p,
            q=q
        )


# FIXME: NetworkX graphs are not currently supported by node2vec, so the
# test below is disabled by wrapping it in a string literal.
"""
@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
def test_node2vec_nx(
graph_file,
directed
):
Gnx = utils.generate_nx_graph_from_file(graph_file, directed=directed,
edgevals=True)
k = random.randint(1, 10)
max_depth = 3
start_vertices = random.sample(range(Gnx.number_of_nodes()), k)
df, seeds = calc_node2vec(
Gnx,
start_vertices,
max_depth,
use_padding=True,
p=0.8,
q=0.5
)
vertex_paths, edge_weights, vertex_path_sizes = df
assert vertex_paths.size == max_depth * k
# NOTE: This below assertion will pass once PR #2089 is merged
# assert edge_weights.size == (max_depth - 1) * k
assert vertex_path_sizes.sum() == vertex_paths.size
"""
39 changes: 39 additions & 0 deletions python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,42 @@ cdef extern from "cugraph_c/algorithms.h":
cugraph_paths_result_t** result,
cugraph_error_t** error
)

###########################################################################
# random_walks
# Opaque result type: declared with an empty body, so no fields are visible
# from Cython. It is the output type of cugraph_node2vec below and is read
# through the accessor functions that follow.
ctypedef struct cugraph_random_walk_result_t:
pass

# Accessor returning a type-erased device array view of the paths in `result`.
cdef cugraph_type_erased_device_array_view_t* \
cugraph_random_walk_result_get_paths(
cugraph_random_walk_result_t* result
)

# Accessor returning a type-erased device array view of the weights in
# `result`.
cdef cugraph_type_erased_device_array_view_t* \
cugraph_random_walk_result_get_weights(
cugraph_random_walk_result_t* result
)

# Accessor returning a type-erased device array view of the path sizes in
# `result`.
cdef cugraph_type_erased_device_array_view_t* \
cugraph_random_walk_result_get_path_sizes(
cugraph_random_walk_result_t* result
)

# Presumably releases `result`; the name suggests the caller must not use it
# afterwards - confirm against the C API header.
cdef void \
cugraph_random_walk_result_free(
cugraph_random_walk_result_t* result
)

# node2vec
# Returns an error code; the walk result and any error object are delivered
# through the `result` and `error` out-parameters (both double pointers).
cdef cugraph_error_code_t \
cugraph_node2vec(
const cugraph_resource_handle_t* handle,
cugraph_graph_t* graph,
const cugraph_type_erased_device_array_view_t* sources,
size_t max_depth,
bool_t compress_result,
double p,
double q,
cugraph_random_walk_result_t** result,
cugraph_error_t** error
)
3 changes: 3 additions & 0 deletions python/pylibcugraph/pylibcugraph/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,6 @@

from pylibcugraph.sssp import EXPERIMENTAL__sssp
sssp = experimental_warning_wrapper(EXPERIMENTAL__sssp)

from pylibcugraph.node2vec import EXPERIMENTAL__node2vec
node2vec = experimental_warning_wrapper(EXPERIMENTAL__node2vec)
Loading