Skip to content

Commit

Permalink
neighbor sampling in COO/CSR format (#1982)
Browse files Browse the repository at this point in the history
This pull request adds neighborhood sampling, as needed by GNN frameworks (DGL, PyTorch-Geometric).

Since I did not hear back on most of the other issues that need to be addressed before this, I am continuing with my plan of first opening a PR with just the API. Once we agree on the final API, and once a minimal version of cugraph-ops is integrated, we can add the implementation of this API.

In particular, for now I am suggesting that the sampling type is exposed in the public API (it does not exist in cugraph-ops yet, since cugraph-ops has not been integrated). The sampling type must be decided ahead of sampling for best performance (either by the end user or by some automatic heuristic on the original graph), which is why it makes sense to have it as a separate parameter for this API.

EDIT: link to issue #1978

Authors:
  - Matt Joux (https://github.com/MatthiasKohl)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Robert Maynard (https://github.com/robertmaynard)
  - Andrei Schaffer (https://github.com/aschaffer)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: #1982
  • Loading branch information
MatthiasKohl authored Feb 22, 2022
1 parent df49ad7 commit e95171f
Show file tree
Hide file tree
Showing 9 changed files with 248 additions and 7 deletions.
6 changes: 3 additions & 3 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ BUILD_DIRS="${LIBCUGRAPH_BUILD_DIR} ${LIBCUGRAPH_ETL_BUILD_DIR} ${CUGRAPH_BUILD_
VERBOSE_FLAG=""
CMAKE_VERBOSE_OPTION=""
BUILD_TYPE=Release
INSTALL_TARGET=install
INSTALL_TARGET="--target install"
BUILD_CPP_TESTS=ON
BUILD_CPP_MG_TESTS=OFF
BUILD_ALL_GPU_ARCH=0
Expand Down Expand Up @@ -198,7 +198,7 @@ if buildAll || hasArg libcugraph; then
-DBUILD_TESTS=${BUILD_CPP_TESTS} \
-DBUILD_CUGRAPH_MG_TESTS=${BUILD_CPP_MG_TESTS} \
${CMAKE_VERBOSE_OPTION}
cmake --build "${LIBCUGRAPH_BUILD_DIR}" -j${PARALLEL_LEVEL} --target ${INSTALL_TARGET} ${VERBOSE_FLAG}
cmake --build "${LIBCUGRAPH_BUILD_DIR}" -j${PARALLEL_LEVEL} ${INSTALL_TARGET} ${VERBOSE_FLAG}
fi

# Configure, build, and install libcugraph_etl
Expand All @@ -220,7 +220,7 @@ if buildAll || hasArg libcugraph_etl; then
-DBUILD_CUGRAPH_MG_TESTS=${BUILD_CPP_MG_TESTS} \
${CMAKE_VERBOSE_OPTION} \
${REPODIR}/cpp/libcugraph_etl
cmake --build "${LIBCUGRAPH_ETL_BUILD_DIR}" -j${PARALLEL_LEVEL} --target ${INSTALL_TARGET} ${VERBOSE_FLAG}
cmake --build "${LIBCUGRAPH_ETL_BUILD_DIR}" -j${PARALLEL_LEVEL} ${INSTALL_TARGET} ${VERBOSE_FLAG}
fi

# Build, and install pylibcugraph
Expand Down
3 changes: 2 additions & 1 deletion ci/release/update-version.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -50,6 +50,7 @@ sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cugraph/sou
sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cugraph/source/conf.py

for FILE in conda/environments/*.yml; do
sed_runner "s/libcugraphops=${CURRENT_SHORT_TAG}/libcugraphops=${NEXT_SHORT_TAG}/g" ${FILE};
sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" ${FILE};
sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE};
sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE};
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/libcugraph/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ requirements:
- {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
- nccl>=2.9.9
- ucx-proc=*=gpu
- libcugraphops {{ minor_version }}.*
- libcusolver>=11.2.1

about:
Expand Down
1 change: 0 additions & 1 deletion conda/recipes/libcugraph_etl/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ requirements:
- cmake>=3.20.1
- doxygen>=1.8.11
- cudatoolkit {{ cuda_version }}.*
- libcugraphops {{ minor_version }}.* # needed for cmake to find transitive deps
- libcudf {{ minor_version }}.*
- libcugraph {{ minor_version }}.*
run:
Expand Down
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ add_library(cugraph SHARED
src/community/legacy/triangles_counting.cu
src/community/legacy/extract_subgraph_by_vertex.cu
src/community/legacy/egonet.cu
src/sampling/neighborhood.cu
src/sampling/random_walks.cu
src/sampling/detail/gather_utils_impl.cu
src/cores/legacy/core_number.cu
Expand Down Expand Up @@ -285,6 +286,7 @@ target_include_directories(cugraph
# - link libraries -------------------------------------------------------------
target_link_libraries(cugraph
PUBLIC
cugraphops::cugraphops
raft::raft
PRIVATE
cugraph::cuHornet
Expand Down
8 changes: 7 additions & 1 deletion cpp/cmake/thirdparty/get_libcugraphops.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,15 @@ function(find_and_configure_cugraphops)
HEADER_NAMES graph/sampling.h
LIBRARY_NAMES cugraph-ops++
INCLUDE_SUFFIXES cugraph-ops
BUILD_EXPORT_SET cugraph-exports
INSTALL_EXPORT_SET cugraph-exports
)

rapids_find_package(cugraphops REQUIRED)
rapids_find_package(cugraphops
REQUIRED
BUILD_EXPORT_SET cugraph-exports
INSTALL_EXPORT_SET cugraph-exports
)

endfunction()

Expand Down
66 changes: 65 additions & 1 deletion cpp/include/cugraph/algorithms.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,6 +24,8 @@
#include <cugraph/internals.hpp>
#include <cugraph/legacy/graph.hpp>

#include <cugraph-ops/graph/sampling.h>

#include <raft/handle.hpp>

namespace cugraph {
Expand Down Expand Up @@ -1416,6 +1418,68 @@ random_walks(raft::handle_t const& handle,
bool use_padding = false,
std::unique_ptr<sampling_params_t> sampling_strategy = nullptr);

/**
 * @brief Generate a sub-sampled graph as an adjacency list (CSR format) given an input graph,
 * a list of vertices and a sample size per vertex. The output graph consists of the given
 * vertices, with each vertex having at most `sampling_size` neighbors from the original graph.
 *
 * @tparam graph_t Type of input graph/view (typically, graph_view_t, non-transposed and
 * single-gpu).
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param rng The Rng (stateful) instance holding pseudo-random number generator state.
 * @param graph Graph (view) object to sub-sample.
 * @param ptr_d_start Device pointer to the set of starting vertex indices for the sub-sampling.
 * @param num_start_vertices Number of starting vertices to use for the sub-sampling.
 * @param sampling_size Maximum number of neighbors per output vertex.
 * @param sampling_algo The sampling algorithm (algo R/algo L/etc.) used to produce outputs.
 * @return std::tuple<rmm::device_uvector<typename graph_t::edge_type>,
 *                    rmm::device_uvector<typename graph_t::vertex_type>>
 * Tuple consisting of two arrays representing the offsets and indices of
 * the sub-sampled graph.
 */
template <typename graph_t>
std::tuple<rmm::device_uvector<typename graph_t::edge_type>,
rmm::device_uvector<typename graph_t::vertex_type>>
sample_neighbors_adjacency_list(raft::handle_t const& handle,
ops::gnn::graph::Rng& rng,
graph_t const& graph,
typename graph_t::vertex_type const* ptr_d_start,
size_t num_start_vertices,
size_t sampling_size,
ops::gnn::graph::SamplingAlgoT sampling_algo);

/**
 * @brief Generate a sub-sampled graph as an edge list (COO format) given an input graph,
 * a list of vertices and a sample size per vertex. The output graph consists of the given
 * vertices, with each vertex having at most `sampling_size` neighbors from the original graph.
 *
 * @tparam graph_t Type of input graph/view (typically, graph_view_t, non-transposed and
 * single-gpu).
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param rng The Rng (stateful) instance holding pseudo-random number generator state.
 * @param graph Graph (view) object to sub-sample.
 * @param ptr_d_start Device pointer to the set of starting vertex indices for the sub-sampling.
 * @param num_start_vertices Number of starting vertices to use for the sub-sampling.
 * @param sampling_size Maximum number of neighbors per output vertex.
 * @param sampling_algo The sampling algorithm (algo R/algo L/etc.) used to produce outputs.
 * @return std::tuple<rmm::device_uvector<typename graph_t::vertex_type>,
 *                    rmm::device_uvector<typename graph_t::vertex_type>>
 * Tuple consisting of two arrays representing the source and destination nodes of
 * the sub-sampled graph.
 */
template <typename graph_t>
std::tuple<rmm::device_uvector<typename graph_t::vertex_type>,
rmm::device_uvector<typename graph_t::vertex_type>>
sample_neighbors_edgelist(raft::handle_t const& handle,
ops::gnn::graph::Rng& rng,
graph_t const& graph,
typename graph_t::vertex_type const* ptr_d_start,
size_t num_start_vertices,
size_t sampling_size,
ops::gnn::graph::SamplingAlgoT sampling_algo);

/**
* @brief Finds (weakly-connected-)component IDs of each vertices in the input graph.
*
Expand Down
115 changes: 115 additions & 0 deletions cpp/src/sampling/neighborhood.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cugraph/algorithms.hpp>

#include <utilities/cugraph_ops_utils.hpp>

#include <cugraph-ops/graph/sampling.h>

namespace cugraph {

// CSR flavour of neighborhood sampling: converts the cugraph graph (view) into the cugraph-ops
// graph descriptor plus a max-degree value, then forwards every argument unchanged to
// ops::gnn::graph::uniform_sample_csr on the stream owned by `handle`.
template <typename graph_t>
std::tuple<rmm::device_uvector<typename graph_t::edge_type>,
           rmm::device_uvector<typename graph_t::vertex_type>>
sample_neighbors_adjacency_list(raft::handle_t const& handle,
                                ops::gnn::graph::Rng& rng,
                                graph_t const& graph,
                                typename graph_t::vertex_type const* ptr_d_start,
                                size_t num_start_vertices,
                                size_t sampling_size,
                                ops::gnn::graph::SamplingAlgoT sampling_algo)
{
  // Element 0 is the cugraph-ops graph, element 1 the max degree handed to the sampler.
  auto const graph_and_degree = detail::get_graph_and_max_degree(graph);
  auto const& csr_graph       = std::get<0>(graph_and_degree);
  auto const& degree_bound    = std::get<1>(graph_and_degree);

  return ops::gnn::graph::uniform_sample_csr(rng,
                                             csr_graph,
                                             ptr_d_start,
                                             num_start_vertices,
                                             sampling_size,
                                             sampling_algo,
                                             degree_bound,
                                             handle.get_stream());
}

// COO flavour of neighborhood sampling: converts the cugraph graph (view) into the cugraph-ops
// graph descriptor plus a max-degree value, then forwards every argument unchanged to
// ops::gnn::graph::uniform_sample_coo on the stream owned by `handle`.
template <typename graph_t>
std::tuple<rmm::device_uvector<typename graph_t::vertex_type>,
           rmm::device_uvector<typename graph_t::vertex_type>>
sample_neighbors_edgelist(raft::handle_t const& handle,
                          ops::gnn::graph::Rng& rng,
                          graph_t const& graph,
                          typename graph_t::vertex_type const* ptr_d_start,
                          size_t num_start_vertices,
                          size_t sampling_size,
                          ops::gnn::graph::SamplingAlgoT sampling_algo)
{
  // Element 0 is the cugraph-ops graph, element 1 the max degree handed to the sampler.
  auto const graph_and_degree = detail::get_graph_and_max_degree(graph);
  auto const& csr_graph       = std::get<0>(graph_and_degree);
  auto const& degree_bound    = std::get<1>(graph_and_degree);

  return ops::gnn::graph::uniform_sample_coo(rng,
                                             csr_graph,
                                             ptr_d_start,
                                             num_start_vertices,
                                             sampling_size,
                                             sampling_algo,
                                             degree_bound,
                                             handle.get_stream());
}

// Explicit template instantiation directives (EIDir's).
//
// The two function templates above are defined only in this translation unit, so each
// graph_view_t specialization that should be usable must be instantiated here. Every
// instantiation below uses float weights and `false, false` for the last two graph_view_t
// parameters; vertex and edge types are either both int32_t or both int64_t.
// NOTE(review): "SG" presumably means single-GPU and "FP32" the float weight type — confirm.
//
// CSR (sample_neighbors_adjacency_list), SG, FP32 {
// 32-bit vertex/edge ids
template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
sample_neighbors_adjacency_list<graph_view_t<int32_t, int32_t, float, false, false>>(
raft::handle_t const& handle,
ops::gnn::graph::Rng& rng,
graph_view_t<int32_t, int32_t, float, false, false> const& gview,
int32_t const* ptr_d_start,
size_t num_start_vertices,
size_t sampling_size,
ops::gnn::graph::SamplingAlgoT sampling_algo);

// 64-bit vertex/edge ids
template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
sample_neighbors_adjacency_list<graph_view_t<int64_t, int64_t, float, false, false>>(
raft::handle_t const& handle,
ops::gnn::graph::Rng& rng,
graph_view_t<int64_t, int64_t, float, false, false> const& gview,
int64_t const* ptr_d_start,
size_t num_start_vertices,
size_t sampling_size,
ops::gnn::graph::SamplingAlgoT sampling_algo);
//}
//
// COO (sample_neighbors_edgelist), SG, FP32 {
// 32-bit vertex/edge ids
template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
sample_neighbors_edgelist<graph_view_t<int32_t, int32_t, float, false, false>>(
raft::handle_t const& handle,
ops::gnn::graph::Rng& rng,
graph_view_t<int32_t, int32_t, float, false, false> const& gview,
int32_t const* ptr_d_start,
size_t num_start_vertices,
size_t sampling_size,
ops::gnn::graph::SamplingAlgoT sampling_algo);

// 64-bit vertex/edge ids
template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
sample_neighbors_edgelist<graph_view_t<int64_t, int64_t, float, false, false>>(
raft::handle_t const& handle,
ops::gnn::graph::Rng& rng,
graph_view_t<int64_t, int64_t, float, false, false> const& gview,
int64_t const* ptr_d_start,
size_t num_start_vertices,
size_t sampling_size,
ops::gnn::graph::SamplingAlgoT sampling_algo);
//}

} // namespace cugraph
53 changes: 53 additions & 0 deletions cpp/src/utilities/cugraph_ops_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cugraph/graph_view.hpp>

#include <cugraph-ops/graph/format.h>

#include <limits>
#include <tuple>

namespace cugraph {
namespace detail {

/**
 * @brief Describe a cugraph graph view as a cugraph-ops `fg_csr` graph.
 *
 * Fills in the node count, the index (edge) count and the offsets/indices pointers obtained
 * from the view's matrix partition view. Only pointers are stored; no array is copied here.
 *
 * NOTE(review): both pointers are cast to `EdgeTypeT*`, so this presumably only compiles when
 * the vertex and edge types of the view are the same — confirm.
 *
 * @param gview Graph view to describe (last two graph_view_t template flags are `false`).
 * @return The populated `ops::gnn::graph::fg_csr<EdgeTypeT>` descriptor.
 */
template <typename NodeTypeT, typename EdgeTypeT, typename WeightT>
ops::gnn::graph::fg_csr<EdgeTypeT> get_graph(
  graph_view_t<NodeTypeT, EdgeTypeT, WeightT, false, false> const& gview)
{
  auto partition = gview.get_matrix_partition_view();

  ops::gnn::graph::fg_csr<EdgeTypeT> csr;
  // FIXME: this is evil and is just temporary until we have a matching type in cugraph-ops
  // or we change the type accepted by the functions calling into cugraph-ops
  csr.offsets   = const_cast<EdgeTypeT*>(partition.get_offsets());
  csr.indices   = const_cast<EdgeTypeT*>(partition.get_indices());
  csr.n_nodes   = gview.get_number_of_vertices();
  csr.n_indices = gview.get_number_of_edges();
  return csr;
}

/**
 * @brief Build the cugraph-ops graph descriptor for a graph view, paired with a max-degree value.
 *
 * The degree is not computed from the graph: the largest value representable by `NodeTypeT`
 * is returned as a placeholder (see FIXME below).
 *
 * @param gview Graph view to describe (last two graph_view_t template flags are `false`).
 * @return Tuple of (result of `get_graph(gview)`, `std::numeric_limits<NodeTypeT>::max()`).
 */
template <typename NodeTypeT, typename EdgeTypeT, typename WeightT>
std::tuple<ops::gnn::graph::fg_csr<EdgeTypeT>, NodeTypeT> get_graph_and_max_degree(
  graph_view_t<NodeTypeT, EdgeTypeT, WeightT, false, false> const& gview)
{
  // FIXME this is sufficient for now, but if there is a fast (cached) way
  // of getting max degree, use that instead
  // std::numeric_limits lives in <limits>, which this header must include itself.
  auto const max_degree = std::numeric_limits<NodeTypeT>::max();
  return std::make_tuple(get_graph(gview), max_degree);
}

} // namespace detail
} // namespace cugraph

0 comments on commit e95171f

Please sign in to comment.