From 723688fb1fcc0ff88df8850899e4e3168b78131e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 19 Aug 2021 11:35:12 -0400 Subject: [PATCH 01/57] delete unused file --- .../cugraph/vertex_partition_device.cuh | 110 ------------------ 1 file changed, 110 deletions(-) delete mode 100644 cpp/include/cugraph/vertex_partition_device.cuh diff --git a/cpp/include/cugraph/vertex_partition_device.cuh b/cpp/include/cugraph/vertex_partition_device.cuh deleted file mode 100644 index 9a5bbf4bbcf..00000000000 --- a/cpp/include/cugraph/vertex_partition_device.cuh +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -namespace cugraph { - -template -class vertex_partition_device_base_t { - public: - vertex_partition_device_base_t(vertex_t number_of_vertices) - : number_of_vertices_(number_of_vertices) - { - } - - template - __host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( - vertex_type v) const noexcept - { - return ((v >= 0) && (v < number_of_vertices_)); - } - - template - __host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( - vertex_type v) const noexcept - { - return (v < number_of_vertices_); - } - - private: - // should be trivially copyable to device - vertex_t number_of_vertices_{0}; -}; - -template -class vertex_partition_device_t; - -// multi-GPU version -template -class vertex_partition_device_t> - : public vertex_partition_device_base_t { - public: - vertex_partition_device_t(GraphViewType const& graph_view) - : vertex_partition_device_base_t( - graph_view.get_number_of_vertices()), - first_(graph_view.get_local_vertex_first()), - last_(graph_view.get_local_vertex_last()) - { - } - - __host__ __device__ bool is_local_vertex_nocheck( - typename GraphViewType::vertex_type v) const noexcept - { - return (v >= first_) && (v < last_); - } - - __host__ __device__ typename GraphViewType::vertex_type - get_local_vertex_offset_from_vertex_nocheck(typename GraphViewType::vertex_type v) const noexcept - { - return v - first_; - } - - private: - // should be trivially copyable to device - typename GraphViewType::vertex_type first_{0}; - typename GraphViewType::vertex_type last_{0}; -}; - -// single-GPU version -template -class vertex_partition_device_t> - : public vertex_partition_device_base_t { - public: - vertex_partition_device_t(GraphViewType const& graph_view) - : vertex_partition_device_base_t( - graph_view.get_number_of_vertices()) - { - } - - __host__ __device__ constexpr bool is_local_vertex_nocheck( - typename GraphViewType::vertex_type v) const noexcept - { - return true; - } - - __host__ __device__ constexpr typename GraphViewType::vertex_type - get_local_vertex_offset_from_vertex_nocheck(typename GraphViewType::vertex_type v) const noexcept - { - return v; - } -}; - -} // namespace cugraph From aaf7bb33aa018353a161c6e2cd76d94bcdf4e711 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Aug 2021 22:38:39 
-0400 Subject: [PATCH 02/57] update headers to support row/col input properties wrapper --- cpp/include/cugraph/graph_view.hpp | 52 ++++ .../prims/copy_to_adj_matrix_row_col.cuh | 164 +++++------ .../cugraph/prims/row_col_properties.cuh | 258 ++++++++++++++++++ 3 files changed, 383 insertions(+), 91 deletions(-) create mode 100644 cpp/include/cugraph/prims/row_col_properties.cuh diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 3d22828731e..3cab3b7ff8f 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -554,6 +554,26 @@ class graph_view_t get_local_sorted_unique_edge_row_begin() const + { + return local_sorted_unique_edge_row_first_; + } + + std::optional get_local_sorted_unique_edge_row_end() const + { + return local_sorted_unique_edge_row_last_; + } + + std::optional get_local_sorted_unique_edge_col_begin() const + { + return local_sorted_unique_edge_col_first_; + } + + std::optional get_local_sorted_unique_edge_col_end() const + { + return local_sorted_unique_edge_col_last_; + } + private: std::vector adj_matrix_partition_offsets_{}; std::vector adj_matrix_partition_indices_{}; @@ -569,6 +589,12 @@ class graph_view_t> adj_matrix_partition_segment_offsets_{}; + + // FIXME: to be implemented. + std::optional local_sorted_unique_edge_row_first_{std::nullopt}; + std::optional local_sorted_unique_edge_row_last_{std::nullopt}; + std::optional local_sorted_unique_edge_col_first_{std::nullopt}; + std::optional local_sorted_unique_edge_col_last_{std::nullopt}; }; // single-GPU version @@ -748,6 +774,26 @@ class graph_view_t get_local_sorted_unique_edge_row_begin() const + { + return local_sorted_unique_edge_row_first_; + } + + std::optional get_local_sorted_unique_edge_row_end() const + { + return local_sorted_unique_edge_row_last_; + } + + std::optional get_local_sorted_unique_edge_col_begin() const + { + return local_sorted_unique_edge_col_first_; + } + + std::optional get_local_sorted_unique_edge_col_end() const + { + return local_sorted_unique_edge_col_last_; + } + private: edge_t const* offsets_{nullptr}; vertex_t const* indices_{nullptr}; @@ -755,6 +801,12 @@ class graph_view_t> segment_offsets_{std::nullopt}; + + // FIXME: to be implemented. 
+ std::optional local_sorted_unique_edge_row_first_{std::nullopt}; + std::optional local_sorted_unique_edge_row_last_{std::nullopt}; + std::optional local_sorted_unique_edge_col_first_{std::nullopt}; + std::optional local_sorted_unique_edge_col_last_{std::nullopt}; }; } // namespace cugraph diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 96aefa016fa..bd0d4a7e3dd 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -44,11 +45,11 @@ namespace detail { template + typename MatrixMajorValueOutputWrapper> void copy_to_matrix_major(raft::handle_t const& handle, GraphViewType const& graph_view, VertexValueInputIterator vertex_value_input_first, - MatrixMajorValueOutputIterator matrix_major_value_output_first) + MatrixMajorValueOutputWrapper& matrix_major_value_output) { if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); @@ -79,7 +80,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, } device_allgatherv(col_comm, vertex_value_input_first, - matrix_major_value_output_first, + matrix_major_value_output.value_data(), rx_counts, displacements, handle.get_stream()); @@ -101,20 +102,20 @@ void copy_to_matrix_major(raft::handle_t const& handle, thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), vertex_value_input_first, vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_major_value_output_first); + matrix_major_value_output.value_data()); } } template + typename MatrixMajorValueOutputWrapper> void copy_to_matrix_major(raft::handle_t const& handle, GraphViewType const& graph_view, VertexIterator vertex_first, VertexIterator vertex_last, VertexValueInputIterator vertex_value_input_first, - MatrixMajorValueOutputIterator matrix_major_value_output_first) + MatrixMajorValueOutputWrapper& matrix_major_value_output) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -194,7 +195,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, rx_value_first, rx_value_first + rx_counts[i], map_first, - matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); + matrix_major_value_output.value_data() + matrix_partition.get_major_value_start_offset()); } else { auto map_first = thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -207,7 +208,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, rx_value_first, rx_value_first + rx_counts[i], map_first, - matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); + matrix_major_value_output.value_data() + matrix_partition.get_major_value_start_offset()); } } @@ -230,17 +231,17 @@ void copy_to_matrix_major(raft::handle_t const& handle, val_first, val_first + thrust::distance(vertex_first, vertex_last), vertex_first, - matrix_major_value_output_first); + matrix_major_value_output.value_data()); } } template + typename MatrixMinorValueOutputWrapper> void copy_to_matrix_minor(raft::handle_t const& handle, GraphViewType const& graph_view, VertexValueInputIterator vertex_value_input_first, - MatrixMinorValueOutputIterator matrix_minor_value_output_first) + MatrixMinorValueOutputWrapper& matrix_minor_value_output) { if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); @@ -271,7 
+272,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, } device_allgatherv(row_comm, vertex_value_input_first, - matrix_minor_value_output_first, + matrix_minor_value_output.value_data(), rx_counts, displacements, handle.get_stream()); @@ -293,20 +294,20 @@ void copy_to_matrix_minor(raft::handle_t const& handle, thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), vertex_value_input_first, vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_minor_value_output_first); + matrix_minor_value_output.value_data()); } } template + typename MatrixMinorValueOutputWrapper> void copy_to_matrix_minor(raft::handle_t const& handle, GraphViewType const& graph_view, VertexIterator vertex_first, VertexIterator vertex_last, VertexValueInputIterator vertex_value_input_first, - MatrixMinorValueOutputIterator matrix_minor_value_output_first) + MatrixMinorValueOutputWrapper& matrix_minor_value_output) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -384,7 +385,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, rx_value_first, rx_value_first + rx_counts[i], map_first, - matrix_minor_value_output_first); + matrix_minor_value_output.value_data()); } else { auto map_first = thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -396,7 +397,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, rx_value_first, rx_value_first + rx_counts[i], map_first, - matrix_minor_value_output_first); + matrix_minor_value_output.value_data()); } } @@ -418,7 +419,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, val_first, val_first + thrust::distance(vertex_first, vertex_last), vertex_first, - matrix_minor_value_output_first); + matrix_minor_value_output.value_data()); } } @@ -433,33 +434,29 @@ void copy_to_matrix_minor(raft::handle_t const& handle, * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixRowValueOutputIterator Type of the iterator for graph adjacency matrix row - * output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_output_first Iterator pointing to the adjacency matrix row output - * property variables for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_output_last` (exclusive) is deduced as @p adj_matrix_row_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties (for the rows assigned to this process in multi-GPU). 
*/ -template -void copy_to_adj_matrix_row(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixRowValueOutputIterator adj_matrix_row_value_output_first) +template +void copy_to_adj_matrix_row( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + row_properties_t::value_type>& + adj_matrix_row_value_output) { - if (GraphViewType::is_adj_matrix_transposed) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { copy_to_matrix_minor( - handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output_first); + handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); } else { copy_to_matrix_major( - handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output_first); + handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); } } @@ -474,8 +471,6 @@ void copy_to_adj_matrix_row(raft::handle_t const& handle, * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixRowValueOutputIterator Type of the iterator for graph adjacency matrix row - * output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. @@ -486,36 +481,33 @@ void copy_to_adj_matrix_row(raft::handle_t const& handle, * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_output_first Iterator pointing to the adjacency matrix row output - * property variables for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_output_last` (exclusive) is deduced as @p adj_matrix_row_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties (for the rows assigned to this process in multi-GPU). 
*/ -template -void copy_to_adj_matrix_row(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixRowValueOutputIterator adj_matrix_row_value_output_first) +template +void copy_to_adj_matrix_row( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexValueInputIterator vertex_value_input_first, + row_properties_t::value_type>& + adj_matrix_row_value_output) { - if (GraphViewType::is_adj_matrix_transposed) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { copy_to_matrix_minor(handle, graph_view, vertex_first, vertex_last, vertex_value_input_first, - adj_matrix_row_value_output_first); + adj_matrix_row_value_output); } else { copy_to_matrix_major(handle, graph_view, vertex_first, vertex_last, vertex_value_input_first, - adj_matrix_row_value_output_first); + adj_matrix_row_value_output); } } @@ -523,38 +515,33 @@ void copy_to_adj_matrix_row(raft::handle_t const& handle, * @brief Copy vertex property values to the corresponding graph adjacency matrix column property * variables. * - * This version fills the entire set of graph adjacency matrix column property values. This function - * is inspired by thrust::copy(). + * This version fills the entire set of graph adjacency matrix column property values. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixColValueOutputIterator Type of the iterator for graph adjacency matrix column - * output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_col_value_output_first Iterator pointing to the adjacency matrix column output - * property variables for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties (for the columns assigned to this process in multi-GPU). 
*/ -template -void copy_to_adj_matrix_col(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixColValueOutputIterator adj_matrix_col_value_output_first) +template +void copy_to_adj_matrix_col( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + col_properties_t::value_type>& + adj_matrix_col_value_output) { - if (GraphViewType::is_adj_matrix_transposed) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { copy_to_matrix_major( - handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output_first); + handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); } else { copy_to_matrix_minor( - handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output_first); + handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); } } @@ -564,13 +551,11 @@ void copy_to_adj_matrix_col(raft::handle_t const& handle, * * This version fills only a subset of graph adjacency matrix column property values. [@p * vertex_first, @p vertex_last) specifies the vertices with new values to be copied to graph - * adjacency matrix column property variables. This function is inspired by thrust::copy(). + * adjacency matrix column property variables. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixColValueOutputIterator Type of the iterator for graph adjacency matrix column - * output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. @@ -581,36 +566,33 @@ void copy_to_adj_matrix_col(raft::handle_t const& handle, * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_col_value_output_first Iterator pointing to the adjacency matrix column output - * property variables for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties (for the columns assigned to this process in multi-GPU). 
*/ -template -void copy_to_adj_matrix_col(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixColValueOutputIterator adj_matrix_col_value_output_first) +template +void copy_to_adj_matrix_col( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexValueInputIterator vertex_value_input_first, + col_properties_t::value_type>& + adj_matrix_col_value_output) { - if (GraphViewType::is_adj_matrix_transposed) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { copy_to_matrix_major(handle, graph_view, vertex_first, vertex_last, vertex_value_input_first, - adj_matrix_col_value_output_first); + adj_matrix_col_value_output); } else { copy_to_matrix_minor(handle, graph_view, vertex_first, vertex_last, vertex_value_input_first, - adj_matrix_col_value_output_first); + adj_matrix_col_value_output); } } diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh new file mode 100644 index 00000000000..b1f550cf3e3 --- /dev/null +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename vertex_t, typename ValueIterator>
+struct key_to_value_t {
+  thrust::optional<vertex_t const*> const key_first{};
+  thrust::optional<vertex_t const*> const key_last{};
+  ValueIterator const value_first{};
+
+  __device__ typename thrust::iterator_traits<ValueIterator>::value_type operator()(
+    vertex_t offset) const
+  {
+    if (key_first) {
+      auto it = thrust::lower_bound(thrust::seq, *key_first, *key_last, offset);
+      assert((it != *key_last) && (*it == offset));
+      return *(value_first + thrust::distance(*key_first, it));
+    } else {
+      return *(value_first + offset);
+    }
+  }
+};
+
+template <typename vertex_t, typename T>
+class major_properties_t {
+ public:
+  major_properties_t() : buffer_(allocate_dataframe_buffer<T>(0, rmm::cuda_stream_view{})) {}
+
+  major_properties_t(raft::handle_t const& handle, vertex_t buffer_size)
+    : buffer_(allocate_dataframe_buffer<T>(buffer_size, handle.get_stream()))
+  {
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream)
+  {
+    thrust::fill(
+      rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value);
+  }
+
+  auto begin() const { return get_dataframe_buffer_begin(buffer_); }
+
+  auto value_data() { return get_dataframe_buffer_begin(buffer_); }
+
+ private:
+  decltype(allocate_dataframe_buffer<T>(0, rmm::cuda_stream_view{})) buffer_;
+};
+
+template <typename vertex_t, typename T>
+class minor_properties_t {
+ public:
+  minor_properties_t()
+    : key_first_(std::nullopt),
+      key_last_(std::nullopt),
+      buffer_(allocate_dataframe_buffer<T>(0, rmm::cuda_stream_view{}))
+  {
+  }
+
+  minor_properties_t(raft::handle_t const& handle, vertex_t buffer_size)
+    : key_first_(std::nullopt),
+      key_last_(std::nullopt),
+      buffer_(allocate_dataframe_buffer<T>(buffer_size, handle.get_stream()))
+  {
+  }
+
+  minor_properties_t(raft::handle_t const& handle,
+                     vertex_t const* key_first,
+                     vertex_t const* key_last)
+    : key_first_(key_first),
+      key_last_(key_last),
+      buffer_(
+        allocate_dataframe_buffer<T>(thrust::distance(key_first, key_last), handle.get_stream()))
+  {
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream)
+  {
+    thrust::fill(
+      rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value);
+  }
+
+  auto begin() const
+  {
+    auto value_first = get_dataframe_buffer_begin(buffer_);
+    return thrust::make_transform_iterator(
+      thrust::make_counting_iterator(vertex_t{0}),
+      key_to_value_t<vertex_t, decltype(value_first)>{
+        key_first_ ? thrust::make_optional(*key_first_) : thrust::nullopt,
+        key_last_ ? thrust::make_optional(*key_last_) : thrust::nullopt,
+        value_first});
+  }
+
+  auto value_data() { return get_dataframe_buffer_begin(buffer_); }
+
+ private:
+  std::optional<vertex_t const*> key_first_{std::nullopt};
+  std::optional<vertex_t const*> key_last_{std::nullopt};
+
+  decltype(allocate_dataframe_buffer<T>(0, rmm::cuda_stream_view{})) buffer_;
+};
+
+}  // namespace detail
+
+template <typename GraphViewType, typename T, typename Enable = void>
+class row_properties_t;
+
+template <typename GraphViewType, typename T>
+class row_properties_t<GraphViewType,
+                       T,
+                       std::enable_if_t<GraphViewType::is_adj_matrix_transposed>> {
+ public:
+  using value_type = T;
+
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view)
+  {
+    auto key_first = graph_view.get_local_sorted_unique_edge_row_begin();
+    auto key_last  = graph_view.get_local_sorted_unique_edge_row_end();
+    if (key_first) {
+      properties_ = detail::minor_properties_t<typename GraphViewType::vertex_type, T>(
+        handle, *key_first, *key_last);
+    } else {
+      properties_ = detail::minor_properties_t<typename GraphViewType::vertex_type, T>(
+        handle, graph_view.get_number_of_local_adj_matrix_partition_rows());
+    }
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); }
+
+  auto begin() const { return properties_.begin(); }
+  auto value_data() { return properties_.value_data(); }
+
+ private:
+  detail::minor_properties_t<typename GraphViewType::vertex_type, T> properties_{};
+};
+
+template <typename GraphViewType, typename T>
+class row_properties_t<GraphViewType,
+                       T,
+                       std::enable_if_t<!GraphViewType::is_adj_matrix_transposed>> {
+ public:
+  using value_type = T;
+
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view)
+  {
+    properties_ = detail::major_properties_t<typename GraphViewType::vertex_type, T>(
+      handle, graph_view.get_number_of_local_adj_matrix_partition_rows());
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); }
+
+  auto begin() const { return properties_.begin(); }
+  auto value_data() { return properties_.value_data(); }
+
+ private:
+  detail::major_properties_t<typename GraphViewType::vertex_type, T> properties_{};
+};
+
+template <typename GraphViewType, typename T, typename Enable = void>
+class col_properties_t;
+
+template <typename GraphViewType, typename T>
+class col_properties_t<GraphViewType,
+                       T,
+                       std::enable_if_t<GraphViewType::is_adj_matrix_transposed>> {
+ public:
+  using value_type = T;
+
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view)
+  {
+    properties_ = detail::major_properties_t<typename GraphViewType::vertex_type, T>(
+      handle, graph_view.get_number_of_local_adj_matrix_partition_cols());
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); }
+
+  auto begin() const { return properties_.begin(); }
+  auto value_data() { return properties_.value_data(); }
+
+ private:
+  detail::major_properties_t<typename GraphViewType::vertex_type, T> properties_{};
+};
+
+template <typename GraphViewType, typename T>
+class col_properties_t<GraphViewType,
+                       T,
+                       std::enable_if_t<!GraphViewType::is_adj_matrix_transposed>> {
+ public:
+  using value_type = T;
+
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view)
+  {
+    auto key_first = graph_view.get_local_sorted_unique_edge_col_begin();
+    auto key_last  = graph_view.get_local_sorted_unique_edge_col_end();
+    if (key_first) {
+      properties_ = detail::minor_properties_t<typename GraphViewType::vertex_type, T>(
+        handle, *key_first, *key_last);
+    } else {
+      properties_ = detail::minor_properties_t<typename GraphViewType::vertex_type, T>(
+        handle, graph_view.get_number_of_local_adj_matrix_partition_cols());
+    }
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); }
+
+  auto begin() const { return properties_.begin(); }
+  auto value_data() { return properties_.value_data(); }
+
+ private:
+  detail::minor_properties_t<typename GraphViewType::vertex_type, T> properties_{};
+};
+
+class dummy_properties_t {
+ public:
+  using value_type = thrust::nullopt_t;
+
+  auto begin() const { return thrust::make_constant_iterator(thrust::nullopt); }
+};
+
+}
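// Hedged host-side sketch (not part of the patch) of the lookup that
// key_to_value_t performs on the device: when a sorted, unique key list is
// present, property values are stored once per key rather than once per
// vertex in the minor range, and a vertex offset is translated to its value
// slot by binary search; otherwise the offset indexes the dense value array
// directly. All names below are illustrative.
#include <algorithm>
#include <cassert>
#include <vector>

int key_compressed_lookup(std::vector<int> const& keys,    // sorted & unique
                          std::vector<int> const& values,  // one slot per key
                          int offset)
{
  auto it = std::lower_bound(keys.begin(), keys.end(), offset);
  assert((it != keys.end()) && (*it == offset));  // offset must be a stored key
  return values[it - keys.begin()];
}

int main()
{
  std::vector<int> keys{1, 3, 7};       // e.g. local sorted unique edge rows
  std::vector<int> values{10, 20, 30};  // 3 slots instead of one per vertex
  assert(key_compressed_lookup(keys, values, 3) == 20);
  return 0;
}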
// namespace cugraph From 06ed6c59bd2acd2de3795cc3582009846177ff48 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Aug 2021 22:39:48 -0400 Subject: [PATCH 03/57] update to use the wrapper --- cpp/src/link_analysis/pagerank.cu | 8 +++--- cpp/src/structure/coarsen_graph.cu | 46 ++++++++++++++++++++---------- cpp/src/structure/graph_view.cu | 17 +++++------ cpp/src/traversal/sssp.cu | 36 +++++++---------------- 4 files changed, 54 insertions(+), 53 deletions(-) diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu index 69d5927f629..f06fdccf481 100644 --- a/cpp/src/link_analysis/pagerank.cu +++ b/cpp/src/link_analysis/pagerank.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -188,8 +189,7 @@ void pagerank( // old PageRank values rmm::device_uvector old_pageranks(pull_graph_view.get_number_of_local_vertices(), handle.get_stream()); - rmm::device_uvector adj_matrix_row_pageranks( - pull_graph_view.get_number_of_local_adj_matrix_partition_rows(), handle.get_stream()); + row_properties_t adj_matrix_row_pageranks(handle, pull_graph_view); size_t iter{0}; while (true) { thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), @@ -223,7 +223,7 @@ void pagerank( return pagerank / divisor; }); - copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks.begin()); + copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks); auto unvarying_part = aggregate_personalization_vector_size == 0 ? (dangling_sum * alpha + static_cast(1.0 - alpha)) / @@ -234,7 +234,7 @@ void pagerank( handle, pull_graph_view, adj_matrix_row_pageranks.begin(), - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { return src_val * w * alpha; }, diff --git a/cpp/src/structure/coarsen_graph.cu b/cpp/src/structure/coarsen_graph.cu index a7abb4846bd..02c2fbae3ab 100644 --- a/cpp/src/structure/coarsen_graph.cu +++ b/cpp/src/structure/coarsen_graph.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -129,17 +130,27 @@ edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_major_vertices /* [INOU } } -template +template std::tuple, rmm::device_uvector, std::optional>> decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( raft::handle_t const& handle, matrix_partition_device_view_t const matrix_partition, - vertex_t const* p_major_labels, - vertex_t const* p_minor_labels, + VertexIterator0 const major_label_first, + VertexIterator1 const minor_label_first, std::optional> const& segment_offsets) { + static_assert( + std::is_same_v::value_type, vertex_t>); + static_assert( + std::is_same_v::value_type, vertex_t>); + // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory @@ -152,12 +163,13 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( pair_first, pair_first + edgelist_major_vertices.size(), pair_first, - [p_major_labels, - p_minor_labels, + [major_label_first, + minor_label_first, major_first = matrix_partition.get_major_first(), minor_first = matrix_partition.get_minor_first()] __device__(auto val) { - return thrust::make_tuple(p_major_labels[thrust::get<0>(val) - major_first], - p_minor_labels[thrust::get<1>(val) - minor_first]); + return thrust::make_tuple( + *(major_label_first + (thrust::get<0>(val) - major_first)), + 
*(minor_label_first + (thrust::get<1>(val) - minor_first))); }); auto number_of_edges = groupby_e_and_coarsen_edgelist( @@ -212,16 +224,20 @@ coarsen_graph( // 1. construct coarsened edge list - rmm::device_uvector adj_matrix_minor_labels( - store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() - : graph_view.get_number_of_local_adj_matrix_partition_cols(), - handle.get_stream()); - if (store_transposed) { - copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data()); + std::conditional_t< + store_transposed, + row_properties_t, + vertex_t>, + col_properties_t, + vertex_t>> + adj_matrix_minor_labels(handle, graph_view); + if constexpr (store_transposed) { + copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels); } else { - copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data()); + copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels); } + std::vector> coarsened_edgelist_major_vertices{}; std::vector> coarsened_edgelist_minor_vertices{}; auto coarsened_edgelist_weights = @@ -291,7 +307,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)), major_labels.data(), - adj_matrix_minor_labels.data(), + adj_matrix_minor_labels.begin(), graph_view.get_local_adj_matrix_partition_segment_offsets(i)); // 1-2. globally shuffle diff --git a/cpp/src/structure/graph_view.cu b/cpp/src/structure/graph_view.cu index 088ed214a74..05de14afd19 100644 --- a/cpp/src/structure/graph_view.cu +++ b/cpp/src/structure/graph_view.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -92,8 +93,8 @@ rmm::device_uvector compute_minor_degrees( copy_v_transform_reduce_out_nbr( handle, graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), + dummy_properties_t{}.begin(), [] __device__(vertex_t, vertex_t, weight_t, auto, auto) { return edge_t{1}; }, edge_t{0}, minor_degrees.data()); @@ -101,8 +102,8 @@ rmm::device_uvector compute_minor_degrees( copy_v_transform_reduce_in_nbr( handle, graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), + dummy_properties_t{}.begin(), [] __device__(vertex_t, vertex_t, weight_t, auto, auto) { return edge_t{1}; }, edge_t{0}, minor_degrees.data()); @@ -127,8 +128,8 @@ rmm::device_uvector compute_weight_sums( copy_v_transform_reduce_in_nbr( handle, graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), + dummy_properties_t{}.begin(), [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w; }, weight_t{0.0}, weight_sums.data()); @@ -136,8 +137,8 @@ rmm::device_uvector compute_weight_sums( copy_v_transform_reduce_out_nbr( handle, graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), + dummy_properties_t{}.begin(), [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w; }, weight_t{0.0}, weight_sums.data()); diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu index 8402a74181b..3eb4272cf39 100644 --- a/cpp/src/traversal/sssp.cu +++ b/cpp/src/traversal/sssp.cu @@ -134,22 +134,8 @@ void sssp(raft::handle_t const& handle, // 5. 
SSSP iteration - bool vertex_and_adj_matrix_row_ranges_coincide = - push_graph_view.get_number_of_local_vertices() == - push_graph_view.get_number_of_local_adj_matrix_partition_rows() - ? true - : false; - rmm::device_uvector adj_matrix_row_distances(0, handle.get_stream()); - if (!vertex_and_adj_matrix_row_ranges_coincide) { - adj_matrix_row_distances.resize(push_graph_view.get_number_of_local_adj_matrix_partition_rows(), - handle.get_stream()); - thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - adj_matrix_row_distances.begin(), - adj_matrix_row_distances.end(), - std::numeric_limits::max()); - } - auto row_distances = - !vertex_and_adj_matrix_row_ranges_coincide ? adj_matrix_row_distances.data() : distances; + row_properties_t adj_matrix_row_distances(handle, push_graph_view); + adj_matrix_row_distances.fill(std::numeric_limits::max(), handle.get_stream()); if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).insert(source_vertex); @@ -157,15 +143,13 @@ void sssp(raft::handle_t const& handle, auto near_far_threshold = delta; while (true) { - if (!vertex_and_adj_matrix_row_ranges_coincide) { - copy_to_adj_matrix_row( - handle, - push_graph_view, - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), - distances, - row_distances); - } + copy_to_adj_matrix_row( + handle, + push_graph_view, + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), + distances, + adj_matrix_row_distances); auto vertex_partition = vertex_partition_device_view_t( push_graph_view.get_vertex_partition_view()); @@ -176,7 +160,7 @@ void sssp(raft::handle_t const& handle, vertex_frontier, static_cast(Bucket::cur_near), std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, - row_distances, + adj_matrix_row_distances.begin(), thrust::make_constant_iterator(0) /* dummy */, [vertex_partition, distances, cutoff] __device__( vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { From 32495c5a235a132bc58c5ec9df57af79eca5a431 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 07:35:27 -0700 Subject: [PATCH 04/57] fix MG Louvain test compile errors --- cpp/tests/community/mg_louvain_helper.cu | 16 +++++++--------- cpp/tests/community/mg_louvain_test.cpp | 1 + 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index 5909ab177cd..5e084237ba2 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -85,8 +85,7 @@ compressed_sparse_to_edgelist(edge_t const* compressed_sparse_offsets, // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA // warp per vertex, and low-degree vertices using one CUDA thread per block - auto execution_policy = handle.get_thrust_policy(); - thrust::for_each(execution_policy, + thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(major_first), thrust::make_counting_iterator(major_last), [compressed_sparse_offsets, @@ -96,12 +95,12 @@ compressed_sparse_to_edgelist(edge_t const* compressed_sparse_offsets, auto last = compressed_sparse_offsets[v - major_first + 1]; thrust::fill(thrust::seq, p_majors + first, 
p_majors + last, v); }); - thrust::copy(execution_policy, + thrust::copy(rmm::exec_policy(stream), compressed_sparse_indices, compressed_sparse_indices + number_of_edges, edgelist_minor_vertices.begin()); if (compressed_sparse_weights) { - thrust::copy(execution_policy, + thrust::copy(rmm::exec_policy(stream), (*compressed_sparse_weights), (*compressed_sparse_weights) + number_of_edges, (*edgelist_weights).data()); @@ -124,9 +123,8 @@ void sort_and_coarsen_edgelist( size_t number_of_edges{0}; - auto execution_policy = handle.get_thrust_policy(); if (edgelist_weights) { - thrust::sort_by_key(execution_policy, + thrust::sort_by_key(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size(), (*edgelist_weights).begin()); @@ -137,7 +135,7 @@ void sort_and_coarsen_edgelist( stream); rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); auto it = thrust::reduce_by_key( - execution_policy, + rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size(), (*edgelist_weights).begin(), @@ -150,9 +148,9 @@ void sort_and_coarsen_edgelist( edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices); (*edgelist_weights) = std::move(tmp_edgelist_weights); } else { - thrust::sort(execution_policy, pair_first, pair_first + edgelist_major_vertices.size()); + thrust::sort(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); auto it = - thrust::unique(execution_policy, pair_first, pair_first + edgelist_major_vertices.size()); + thrust::unique(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); number_of_edges = thrust::distance(pair_first, it); } diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index 4ceacba2acd..ae75929ca0b 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include From 9e4514cc005d53908540e941e604a2686cfd17bf Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 07:39:07 -0700 Subject: [PATCH 05/57] clang-format --- cpp/tests/community/mg_louvain_helper.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index 5e084237ba2..d52d8657e2a 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -149,8 +149,8 @@ void sort_and_coarsen_edgelist( (*edgelist_weights) = std::move(tmp_edgelist_weights); } else { thrust::sort(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); - auto it = - thrust::unique(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); + auto it = thrust::unique( + rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); number_of_edges = thrust::distance(pair_first, it); } From be996b3248de3347191e82279d42a6a74108407c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:29:18 -0400 Subject: [PATCH 06/57] add thrust utility function to convert to/from std::tuple and to emulate thrust::tuple_cat --- .../cugraph/utilities/thrust_tuple_utils.cuh | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh b/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh index a46db93f6b3..1a193c68ca9 100644 --- a/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh +++ 
b/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh @@ -60,6 +60,20 @@ struct compute_thrust_tuple_element_sizes_impl { void compute(std::array::value>& arr) const {} }; +template +auto thrust_tuple_to_std_tuple(TupleType tup, std::index_sequence) +{ + return std::make_tuple(thrust::get(tup)...); +} + +template +auto std_tuple_to_thrust_tuple(TupleType tup, std::index_sequence) +{ + constexpr size_t maximum_thrust_tuple_size = 10; + static_assert(std::tuple_size_v <= maximum_thrust_tuple_size); + return thrust::make_tuple(std::get(tup)...); +} + template __device__ std::enable_if_t::value, void> atomic_accumulate_impl( thrust::detail::any_assign& /* dereferencing thrust::discard_iterator results in this type */ lhs, @@ -178,6 +192,30 @@ struct compute_thrust_tuple_element_sizes { } }; +template +auto thrust_tuple_to_std_tuple(TupleType tup) +{ + return detail::thrust_tuple_to_std_tuple( + tup, std::make_index_sequence::value>{}); +} + +template +auto std_tuple_to_thrust_tuple(TupleType tup) +{ + constexpr size_t maximum_thrust_tuple_size = 10; + static_assert(std::tuple_size_v <= maximum_thrust_tuple_size); + return detail::std_tuple_to_thrust_tuple( + tup, std::make_index_sequence>{}); +} + +// a temporary function to emulate thrust::tuple_cat (not supported) using std::tuple_cat (should +// retire once thrust::tuple is replaced with cuda::std::tuple) +template +auto thrust_tuple_cat(TupleTypes... tups) +{ + return std_tuple_to_thrust_tuple(std::tuple_cat(thrust_tuple_to_std_tuple(tups)...)); +} + template struct atomic_accumulate_thrust_tuple { __device__ constexpr void operator()(Iterator iter, TupleType const& value) const From 2f65f41dc25811f23cc9d993e15923a438afde66 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:29:42 -0400 Subject: [PATCH 07/57] added a wrapper class for row/col properties --- .../cugraph/prims/row_col_properties.cuh | 180 +++++++++++++++--- 1 file changed, 149 insertions(+), 31 deletions(-) diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index b1f550cf3e3..adf068d2eb5 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -35,22 +36,59 @@ namespace cugraph { namespace detail { template -struct key_to_value_t { - thrust::optional const key_first{}; - thrust::optional const key_last{}; - ValueIterator const value_first{}; +class major_properties_device_view_t { + public: + using value_type = typename thrust::iterator_traits::value_type; + + major_properties_device_view_t() = default; + + major_properties_device_view_t(ValueIterator value_first) : value_first_(value_first) {} + + void add_offset(vertex_t offset) { value_first_ += offset; } - __device__ typename thrust::iterator_traits::value_type operator()( - vertex_t offset) const + ValueIterator value_data() const { return value_first_; } + + __device__ auto get(vertex_t offset) const { return *(value_first_ + offset); } + + private: + ValueIterator value_first_{}; +}; + +template +class minor_properties_device_view_t { + public: + using value_type = typename thrust::iterator_traits::value_type; + + minor_properties_device_view_t() = default; + + minor_properties_device_view_t(ValueIterator value_first) + : key_first_(thrust::nullopt), key_last_(thrust::nullopt), value_first_(value_first) { - if (key_first) { - auto it = thrust::lower_bound(thrust::seq, *key_first, *key_last, offset); - 
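// Hedged usage sketch (not code from the patch series) for the
// thrust_tuple_cat emulation added in the thrust_tuple_utils.cuh hunk above:
// each thrust::tuple is converted to a std::tuple, std::tuple_cat performs
// the concatenation, and the result is converted back to a thrust::tuple
// (subject to the 10-element thrust::tuple limit asserted in the patch).
#include <cugraph/utilities/thrust_tuple_utils.cuh>

#include <thrust/tuple.h>

#include <cassert>

void thrust_tuple_cat_example()
{
  auto a = thrust::make_tuple(int{1}, float{2.0f});
  auto b = thrust::make_tuple(double{3.0});
  auto c = cugraph::thrust_tuple_cat(a, b);  // thrust::tuple<int, float, double>
  assert(thrust::get<0>(c) == 1);
  assert(thrust::get<2>(c) == 3.0);
}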
assert((it != *key_last) && (*it == offset)); - return *(value_first + thrust::distance(*key_first, it)); - } else { - return *(value_first + offset); + } + + minor_properties_device_view_t(vertex_t const* key_first, + vertex_t const* key_last, + ValueIterator value_first) + : key_first_(key_first), key_last_(key_last), value_first_(value_first) + { + } + + __device__ auto& get(vertex_t offset) const + { + auto value_offset = offset; + if (key_first_) { + auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, offset); + assert((it != *key_last_) && (*it == offset)); + value_offset = static_cast(thrust::distance(*key_first_, it)); } + return *(value_first_ + value_offset); } + + private: + thrust::optional key_first_{thrust::nullopt}; + thrust::optional key_last_{thrust::nullopt}; + + ValueIterator value_first_{}; }; template @@ -65,14 +103,28 @@ class major_properties_t { void fill(T value, rmm::cuda_stream_view stream) { - thrust::fill( - rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value); + thrust::fill(rmm::exec_policy(stream), + value_data(), + value_data() + size_dataframe_buffer(buffer_), + value); } - auto begin() const { return get_dataframe_buffer_begin(buffer_); } - auto value_data() { return get_dataframe_buffer_begin(buffer_); } + auto device_view() const + { + auto value_first = get_dataframe_buffer_begin(buffer_); + return major_properties_device_view_t(value_first); + } + + auto mutable_device_view() + { + auto value_first = get_dataframe_buffer_begin(buffer_); + static_assert( + !std::is_const_v::value_type>); + return major_properties_device_view_t(value_first); + } + private: decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; }; @@ -106,22 +158,37 @@ class minor_properties_t { void fill(T value, rmm::cuda_stream_view stream) { - thrust::fill( - rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value); + thrust::fill(rmm::exec_policy(stream), + value_data(), + value_data() + size_dataframe_buffer(buffer_), + value); } - auto begin() const + auto value_data() { return get_dataframe_buffer_begin(buffer_); } + + auto device_view() const { auto value_first = get_dataframe_buffer_begin(buffer_); - return thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - key_to_value_t{ - key_first_ ? thrust::make_optional(*key_first_) : thrust::nullopt, - key_last_ ? 
thrust::make_optional(*key_last_) : thrust::nullopt, - value_first}); + if (key_first_) { + return minor_properties_device_view_t( + *key_first_, *key_last_, value_first); + } else { + return minor_properties_device_view_t(value_first); + } } - auto value_data() { return get_dataframe_buffer_begin(buffer_); } + auto mutable_device_view() + { + auto value_first = get_dataframe_buffer_begin(buffer_); + static_assert( + !std::is_const_v::value_type>); + if (key_first_) { + return minor_properties_device_view_t( + *key_first_, *key_last_, value_first); + } else { + return minor_properties_device_view_t(value_first); + } + } private: std::optional key_first_{std::nullopt}; @@ -130,6 +197,22 @@ class minor_properties_t { decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; }; +template ::value_type>::value>* = nullptr> +auto to_thrust_tuple(Iterator iter) +{ + return thrust::make_tuple(iter); +} + +template ::value_type>::value>* = nullptr> +auto to_thrust_tuple(Iterator iter) +{ + return iter.get_iterator_tuple(); +} + } // namespace detail template @@ -144,6 +227,8 @@ class row_properties_t::value); + row_properties_t() = default; + row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { auto key_first = graph_view.get_local_sorted_unique_edge_row_begin(); @@ -159,9 +244,11 @@ class row_properties_t properties_{}; }; @@ -175,6 +262,8 @@ class row_properties_t::value); + row_properties_t() = default; + row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { properties_ = detail::major_properties_t( @@ -183,9 +272,11 @@ class row_properties_t properties_{}; }; @@ -202,6 +293,8 @@ class col_properties_t::value); + col_properties_t() = default; + col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { properties_ = detail::major_properties_t( @@ -210,9 +303,11 @@ class col_properties_t properties_{}; }; @@ -226,6 +321,8 @@ class col_properties_t::value); + col_properties_t() = default; + col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { auto key_first = graph_view.get_local_sorted_unique_edge_col_begin(); @@ -241,18 +338,39 @@ class col_properties_t properties_{}; }; +template +class dummy_properties_device_view_t { + public: + using value_type = thrust::nullopt_t; + + void add_offset(vertex_t offset) {} // no-op + + __device__ auto get(vertex_t offset) const { return thrust::nullopt; } +}; + +template class dummy_properties_t { public: using value_type = thrust::nullopt_t; - auto begin() const { return thrust::make_constant_iterator(thrust::nullopt); } + auto device_view() const { return dummy_properties_device_view_t{}; } }; +template +auto device_view_concat(detail::major_properties_device_view_t... 
device_views) +{ + auto concat_first = thrust::make_zip_iterator( + thrust_tuple_cat(detail::to_thrust_tuple(device_views.value_data())...)); + return detail::major_properties_device_view_t(concat_first); +} + } // namespace cugraph From 94717d97597681d2793baa6e1bed340d73b11764 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:32:07 -0400 Subject: [PATCH 08/57] update prims to use the row/col properties wrapper --- .../copy_v_transform_reduce_in_out_nbr.cuh | 216 ++++++------ ...ransform_reduce_key_aggregated_out_nbr.cuh | 147 +++++---- cpp/include/cugraph/prims/count_if_e.cuh | 48 +-- .../cugraph/prims/property_op_utils.cuh | 22 +- ...orm_reduce_by_adj_matrix_row_col_key_e.cuh | 309 +++++++++--------- .../cugraph/prims/transform_reduce_e.cuh | 147 +++++---- .../update_frontier_v_push_if_out_nbr.cuh | 131 ++++---- 7 files changed, 515 insertions(+), 505 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 335b34828e5..353040d18e8 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -46,8 +46,8 @@ int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 512; template @@ -57,8 +57,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( typename GraphViewType::weight_type, GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_hypersparse_first, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultValueOutputIterator result_value_output_first, EdgeOp e_op, T init /* relevent only if update_major == true */) @@ -86,8 +86,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(static_cast(major_idx)); auto transform_op = [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, + &adj_matrix_row_value_input, + &adj_matrix_col_value_input, &e_op, major, indices, @@ -106,14 +106,14 @@ __global__ void for_all_major_for_all_nbr_hypersparse( : minor_offset; return evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); }; @@ -143,8 +143,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( template @@ -155,8 +155,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultValueOutputIterator result_value_output_first, EdgeOp e_op, T init /* relevent only if update_major == true */) @@ -178,8 +178,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(static_cast(major_offset)); auto transform_op = [&matrix_partition, - 
&adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, + &adj_matrix_row_value_input, + &adj_matrix_col_value_input, &e_op, major_offset, indices, @@ -201,14 +201,14 @@ __global__ void for_all_major_for_all_nbr_low_degree( : minor_offset; return evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); }; @@ -238,8 +238,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( template @@ -250,8 +250,8 @@ __global__ void for_all_major_for_all_nbr_mid_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultValueOutputIterator result_value_output_first, EdgeOp e_op, T init /* relevent only if update_major == true */) @@ -294,14 +294,14 @@ __global__ void for_all_major_for_all_nbr_mid_degree( : minor_offset; auto e_op_result = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); if (update_major) { e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); @@ -320,8 +320,8 @@ __global__ void for_all_major_for_all_nbr_mid_degree( template @@ -332,8 +332,8 @@ __global__ void for_all_major_for_all_nbr_high_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultValueOutputIterator result_value_output_first, EdgeOp e_op, T init /* relevent only if update_major == true */) @@ -373,14 +373,14 @@ __global__ void for_all_major_for_all_nbr_high_degree( : minor_offset; auto e_op_result = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); if (update_major) { e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); @@ -401,15 +401,15 @@ __global__ void for_all_major_for_all_nbr_high_degree( template void copy_v_transform_reduce_nbr(raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, EdgeOp e_op, T init, VertexValueOutputIterator vertex_value_output_first) @@ -475,12 +475,14 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_value_start_offset() - : vertex_t{0}; + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + if constexpr (GraphViewType::is_adj_matrix_transposed) { + matrix_partition_col_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + } else { + matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + } + std::conditional_t< GraphViewType::is_multi_gpu, std::conditional_t, @@ -506,8 +508,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_first() + (*segment_offsets)[1], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first, e_op, major_init); @@ -521,8 +523,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition.get_major_first() + (*segment_offsets)[2], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first + (update_major ? (*segment_offsets)[1] : vertex_t{0}), e_op, major_init); @@ -536,8 +538,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[2], matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first + (update_major ? (*segment_offsets)[2] : vertex_t{0}), e_op, major_init); @@ -559,8 +561,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, <<>>( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first + (update_major ? (*segment_offsets)[3] : vertex_t{0}), e_op, major_init); @@ -576,8 +578,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first, e_op, major_init); @@ -681,28 +683,29 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, * and thrust::copy() (update vertex properties part, take transform_reduce output as copy input). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. 
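The @tparam rewrite just below captures the core of this patch: raw per-element iterators become device-copyable wrapper objects. A minimal host-side sketch of the contract the updated prims assume of those wrappers (value_input_wrapper_t is a hypothetical stand-in, not one of the actual cugraph classes, which differ in detail):

#include <cstddef>

// Hypothetical stand-in for the wrapper contract: value_type, get(offset)
// replacing *(input_first + offset), and add_offset() replacing the iterator
// arithmetic previously done at matrix partition boundaries.
template <typename T>
class value_input_wrapper_t {
 public:
  using value_type = T;

  explicit value_input_wrapper_t(T const* first) : first_(first) {}

  void add_offset(std::ptrdiff_t delta) { first_ += delta; }

  value_type get(std::ptrdiff_t offset) const { return *(first_ + offset); }

 private:
  T const* first_{nullptr};
};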
+ * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @tparam T Type of the initial value for reduction over the incoming edges. * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first * (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last` @@ -710,23 +713,23 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, * graph_view.get_number_of_local_vertices(). 
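Putting the pieces together, a caller might look roughly like the following sketch, written as a function template so the fragment is syntactically complete. The constructor arguments of row_properties_t, the dummy_properties_t template parameter, and the copy_to_adj_matrix_row signature are assumptions inferred from the parameter documentation above, not verified against the actual headers:

// Illustrative only: sums w * src_value over each vertex's incoming edges.
template <typename GraphViewType,
          typename VertexValueInputIterator,
          typename VertexValueOutputIterator>
void weighted_in_sum(raft::handle_t const& handle,
                     GraphViewType const& graph_view,
                     VertexValueInputIterator vertex_value_input_first,
                     VertexValueOutputIterator vertex_value_output_first)
{
  using weight_t = typename GraphViewType::weight_type;
  cugraph::row_properties_t<GraphViewType, weight_t> row_values(handle, graph_view);
  copy_to_adj_matrix_row(handle, graph_view, vertex_value_input_first, row_values);
  copy_v_transform_reduce_in_nbr(
    handle,
    graph_view,
    row_values.device_view(),
    cugraph::dummy_properties_t<typename GraphViewType::vertex_type>{}.device_view(),
    [] __device__(auto src, auto dst, auto w, auto src_value, auto) {
      return src_value * w;  // weighted contribution along each incoming edge
    },
    weight_t{0.0},
    vertex_value_output_first);
}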
*/ template void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, EdgeOp e_op, T init, VertexValueOutputIterator vertex_value_output_first) { detail::copy_v_transform_reduce_nbr(handle, graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, e_op, init, vertex_value_output_first); @@ -740,31 +743,29 @@ void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle, * input). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @tparam T Type of the initial value for reduction over the outgoing edges. * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first - * + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p - * adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional - * edge weight), *(@p adj_matrix_row_value_input_first + i), and *(@p - * adj_matrix_col_value_input_first + j) (where i is in [0, - * graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced. + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). 
Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. + * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. * @param vertex_value_output_first Iterator pointing to the vertex property variables for the * first (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last` @@ -772,24 +773,23 @@ void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle, * graph_view.get_number_of_local_vertices(). */ template -void copy_v_transform_reduce_out_nbr( - raft::handle_t const& handle, - GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - EdgeOp e_op, - T init, - VertexValueOutputIterator vertex_value_output_first) +void copy_v_transform_reduce_out_nbr(raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + EdgeOp e_op, + T init, + VertexValueOutputIterator vertex_value_output_first) { detail::copy_v_transform_reduce_nbr(handle, graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, e_op, init, vertex_value_output_first); diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 5ae32a6f56a..db9e98310c3 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -41,14 +41,14 @@ namespace detail { int32_t constexpr copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size = 1024; // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used -template +template struct minor_to_key_t { - using vertex_t = typename std::iterator_traits::value_type; - VertexIterator adj_matrix_col_key_first{}; + using vertex_t = typename AdjMatrixColKeyInputWrapper::value_type; + AdjMatrixColKeyInputWrapper adj_matrix_col_key_input{}; vertex_t minor_first{}; __device__ vertex_t operator()(vertex_t minor) { - return *(adj_matrix_col_key_first + (minor - minor_first)); + return adj_matrix_col_key_input.get(minor - minor_first); } }; @@ -209,8 +209,9 @@ void decompress_matrix_partition_to_fill_edgelist_majors( * support two level reduction for every vertex. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColKeyInputWrapper Type of the wrapper for graph adjacency matrix column keys. * @tparam VertexIterator Type of the iterator for graph adjacency matrix column key values for * aggregation (key type should coincide with vertex type). 
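The two-level reduction this function documents, collapsed to serial host code on a toy CSR (illustration only; the real prim runs per matrix partition, with the key-to-value map distributed across GPUs):

#include <cstdio>
#include <map>
#include <vector>

int main()
{
  // toy CSR with 3 vertices; key[] assigns a key to each destination column
  std::vector<int> offsets{0, 3, 5, 6};
  std::vector<int> indices{1, 2, 2, 0, 2, 0};
  std::vector<double> weights{1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  std::vector<int> key{0, 1, 1};
  std::vector<double> key_value{10.0, 20.0};  // value looked up per key

  for (int v = 0; v < 3; ++v) {
    std::map<int, double> aggregated;  // key -> summed edge weight
    for (int e = offsets[v]; e < offsets[v + 1]; ++e) {
      aggregated[key[indices[e]]] += weights[e];
    }
    double sum{0.0};  // reduce_op == plus, init == 0
    for (auto [k, w] : aggregated) {
      sum += w * key_value[k];  // stand-in for key_aggregated_e_op
    }
    std::printf("vertex %d: %g\n", v, sum);
  }
  return 0;
}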
* @tparam ValueIterator Type of the iterator for values in (key, value) pairs. @@ -221,28 +222,27 @@ void decompress_matrix_partition_to_fill_edgelist_majors( * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_key_first Iterator pointing to the adjacency matrix column key (for - * aggregation) for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_key_last` (exclusive) is deduced as @p adj_matrix_col_key_first + @p - * graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param map_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs - * (assigned to this process in multi-GPU, - * `cugraph::detail::compute_gpu_id_from_vertex_t` is used to map keys to processes). - * (Key, value) pairs may be provided by transform_reduce_by_adj_matrix_row_key_e() or - * transform_reduce_by_adj_matrix_col_key_e(). - * @param map_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs (assigned - * to this process in multi-GPU). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_key_input Device-copyable wrapper used to access column keys (for the + * columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view(). Use copy_to_adj_matrix_col to fill the wrapper. + * @param map_unique_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs + * (assigned to this process in multi-GPU, `cugraph::detail::compute_gpu_id_from_vertex_t` is used + * to map keys to processes). (Key, value) pairs may be provided by + * transform_reduce_by_adj_matrix_row_key_e() or transform_reduce_by_adj_matrix_col_key_e(). + * @param map_unique_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs + * (assigned to this process in multi-GPU). * @param map_value_first Iterator pointing to the first (inclusive) value in (key, value) pairs * (assigned to this process in multi-GPU). `map_value_last` (exclusive) is deduced as @p - * map_value_first + thrust::distance(@p map_key_first, @p map_key_last). + * map_value_first + thrust::distance(@p map_unique_key_first, @p map_unique_key_last). * @param key_aggregated_e_op Quinary operator takes edge source, key, aggregated edge weight, *(@p * adj_matrix_row_value_input_first + i), and value for the key stored in the input (key, value) - * pairs provided by @p map_key_first, @p map_key_last, and @p map_value_first (aggregated over the - * entire set of processes in multi-GPU). 
+ * pairs provided by @p map_unique_key_first, @p map_unique_key_last, and @p map_value_first + * (aggregated over the entire set of processes in multi-GPU). * @param reduce_op Binary operator takes two input arguments and reduce the two variables to one. * @param init Initial value to be added to the reduced @p reduce_op return values for each vertex. * @param vertex_value_output_first Iterator pointing to the vertex property variables for the @@ -251,9 +251,9 @@ void decompress_matrix_partition_to_fill_edgelist_majors( * graph_view.get_number_of_local_vertices(). */ template ::value_type, + static_assert(std::is_same::value_type, typename GraphViewType::vertex_type>::value); - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); using vertex_t = typename GraphViewType::vertex_type; @@ -314,10 +312,10 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( comm.barrier(); // currently, this is ncclAllReduce #endif - auto map_counts = - host_scalar_allgather(row_comm, - static_cast(thrust::distance(map_key_first, map_key_last)), - handle.get_stream()); + auto map_counts = host_scalar_allgather( + row_comm, + static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)), + handle.get_stream()); std::vector map_displacements(row_comm_size, size_t{0}); std::partial_sum(map_counts.begin(), map_counts.end() - 1, map_displacements.begin() + 1); rmm::device_uvector map_keys(map_displacements.back() + map_counts.back(), @@ -326,7 +324,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( allocate_dataframe_buffer(map_keys.size(), handle.get_stream()); for (int i = 0; i < row_comm_size; ++i) { device_bcast(row_comm, - map_key_first, + map_unique_key_first, map_keys.begin() + map_displacements[i], map_counts[i], i, @@ -341,13 +339,13 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input & // output pointers thrust::copy(rmm::exec_policy(handle.get_stream()), - map_key_first, - map_key_last, + map_unique_key_first, + map_unique_key_last, map_keys.begin() + map_displacements[row_comm_rank]); thrust::copy( rmm::exec_policy(handle.get_stream()), map_value_first, - map_value_first + thrust::distance(map_key_first, map_key_last), + map_value_first + thrust::distance(map_unique_key_first, map_unique_key_last), get_dataframe_buffer_begin(map_value_buffer) + map_displacements[row_comm_rank]); handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream @@ -357,8 +355,9 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( kv_map_ptr = std::make_unique< cuco::static_map>( // cuco::static_map requires at least one empty slot - std::max(static_cast(static_cast(map_keys.size()) / load_factor), - static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + std::max( + static_cast(static_cast(map_keys.size()) / load_factor), + static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value, stream_adapter); @@ -374,15 +373,19 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( kv_map_ptr = std::make_unique< cuco::static_map>( // cuco::static_map requires at least one empty slot - std::max(static_cast( - static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), - static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + std::max( + static_cast( + 
static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)) / + load_factor), + static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value, stream_adapter); - auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(map_unique_key_first, map_value_first)); + kv_map_ptr->insert(pair_first, + pair_first + thrust::distance(map_unique_key_first, map_unique_key_last)); } // 2. aggregate each vertex out-going edges based on keys and transform-reduce. @@ -418,8 +421,8 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( if (matrix_partition.get_major_size() > 0) { auto minor_key_first = thrust::make_transform_iterator( matrix_partition.get_indices(), - detail::minor_to_key_t{adj_matrix_col_key_first, - matrix_partition.get_minor_first()}); + detail::minor_to_key_t{adj_matrix_col_key_input, + matrix_partition.get_minor_first()}); thrust::copy(rmm::exec_policy(handle.get_stream()), minor_key_first, minor_key_first + matrix_partition.get_number_of_edges(), @@ -543,28 +546,30 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( allocate_dataframe_buffer(tmp_major_vertices.size(), handle.get_stream()); auto tmp_e_op_result_buffer_first = get_dataframe_buffer_begin(tmp_e_op_result_buffer); + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); - thrust::transform( - rmm::exec_policy(handle.get_stream()), - triplet_first, - triplet_first + tmp_major_vertices.size(), - tmp_e_op_result_buffer_first, - [adj_matrix_row_value_input_first = - adj_matrix_row_value_input_first + matrix_partition.get_major_value_start_offset(), - key_aggregated_e_op, - matrix_partition, - kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); - return key_aggregated_e_op(major, - key, - w, - *(adj_matrix_row_value_input_first + - matrix_partition.get_major_offset_from_major_nocheck(major)), - kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); - }); + thrust::transform(rmm::exec_policy(handle.get_stream()), + triplet_first, + triplet_first + tmp_major_vertices.size(), + tmp_e_op_result_buffer_first, + [matrix_partition_row_value_input, + key_aggregated_e_op, + matrix_partition, + kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op( + major, + key, + w, + matrix_partition_row_value_input.get( + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + }); tmp_minor_keys.resize(0, handle.get_stream()); tmp_key_aggregated_edge_weights.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); diff --git a/cpp/include/cugraph/prims/count_if_e.cuh b/cpp/include/cugraph/prims/count_if_e.cuh index c0f937ee9fd..a715003e7b9 100644 --- a/cpp/include/cugraph/prims/count_if_e.cuh +++ b/cpp/include/cugraph/prims/count_if_e.cuh @@ 
-31,38 +31,38 @@ namespace cugraph { * This function is inspired by thrust::count_if(). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns true if this edge should be - * included in the returned count. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns true if this edge should be included in the returned count. * @return GraphViewType::edge_type Number of times @p e_op returned true. 
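As the implementation just below shows, count_if_e stays a thin shim: the boolean edge operator is adapted to return integer 0/1 and handed to transform_reduce_e. The adapter's idea in isolation, as plain host code (a simplified analogue of cugraph's cast_edge_op_bool_to_integer, not the functor itself):

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

// a boolean predicate becomes a 0/1-valued transform, so counting reduces to
// a plus-reduction with init == 0
template <typename edge_t, typename Pred>
struct cast_bool_to_integer_t {
  Pred pred{};
  edge_t operator()(int64_t x) const { return pred(x) ? edge_t{1} : edge_t{0}; }
};

int main()
{
  std::vector<int64_t> xs(10);
  std::iota(xs.begin(), xs.end(), int64_t{0});
  auto is_even = [](int64_t x) { return x % 2 == 0; };
  cast_bool_to_integer_t<int64_t, decltype(is_even)> op{is_even};
  int64_t count{0};
  for (auto x : xs) { count += op(x); }
  std::printf("count = %lld\n", static_cast<long long>(count));
  return 0;
}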
*/ template typename GraphViewType::edge_type count_if_e( raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, EdgeOp e_op) { using vertex_t = typename GraphViewType::vertex_type; @@ -70,12 +70,12 @@ typename GraphViewType::edge_type count_if_e( return transform_reduce_e(handle, graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, cast_edge_op_bool_to_integer{e_op}, edge_t{0}); diff --git a/cpp/include/cugraph/prims/property_op_utils.cuh b/cpp/include/cugraph/prims/property_op_utils.cuh index e164b14ecf2..a1e9bfd95d7 100644 --- a/cpp/include/cugraph/prims/property_op_utils.cuh +++ b/cpp/include/cugraph/prims/property_op_utils.cuh @@ -43,14 +43,14 @@ struct is_valid_edge_op< template struct evaluate_edge_op { using vertex_type = typename GraphViewType::vertex_type; using weight_type = typename GraphViewType::weight_type; - using row_value_type = typename std::iterator_traits::value_type; - using col_value_type = typename std::iterator_traits::value_type; + using row_value_type = typename AdjMatrixRowValueInputWrapper::value_type; + using col_value_type = typename AdjMatrixColValueInputWrapper::value_type; template struct cast_edge_op_bool_to_integer { static_assert(std::is_integral::value); using vertex_type = typename GraphViewType::vertex_type; using weight_type = typename GraphViewType::weight_type; - using row_value_type = typename std::iterator_traits::value_type; - using col_value_type = typename std::iterator_traits::value_type; + using row_value_type = typename AdjMatrixRowValueInputWrapper::value_type; + using col_value_type = typename AdjMatrixColValueInputWrapper::value_type; EdgeOp e_op{}; @@ -132,10 +132,10 @@ struct property_add> using Type = thrust::tuple; private: - template - __device__ constexpr auto sum_impl(T& t1, T& t2, std::index_sequence) + template + __device__ constexpr auto sum_impl(T& t1, T& t2, std::index_sequence) { - return thrust::make_tuple((thrust::get(t1) + thrust::get(t2))...); + return thrust::make_tuple((thrust::get(t1) + thrust::get(t2))...); } public: diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh index f8583d71f5c..ad357900d6a 100644 --- a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -36,9 +36,9 @@ int32_t constexpr transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_siz template __device__ void update_buffer_element( @@ -49,9 +49,9 @@ __device__ void update_buffer_element( typename GraphViewType::vertex_type major, typename GraphViewType::vertex_type minor, typename GraphViewType::weight_type weight, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* key, T* value) @@ -65,27 +65,26 @@ __device__ void 
update_buffer_element( auto row_offset = GraphViewType::is_adj_matrix_transposed ? minor_offset : major_offset; auto col_offset = GraphViewType::is_adj_matrix_transposed ? major_offset : minor_offset; - *key = *(adj_matrix_row_col_key_first + - ((GraphViewType::is_adj_matrix_transposed != adj_matrix_row_key) ? major_offset - : minor_offset)); + *key = adj_matrix_row_col_key_input.get(( + (GraphViewType::is_adj_matrix_transposed != adj_matrix_row_key) ? major_offset : minor_offset)); *value = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); } template __global__ void for_all_major_for_all_nbr_hypersparse( @@ -94,9 +93,9 @@ __global__ void for_all_major_for_all_nbr_hypersparse( typename GraphViewType::weight_type, GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_hypersparse_first, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* keys, T* values) @@ -129,9 +128,9 @@ __global__ void for_all_major_for_all_nbr_hypersparse( major, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_col_key_input, e_op, keys + local_offset + i, values + local_offset + i); @@ -143,9 +142,9 @@ __global__ void for_all_major_for_all_nbr_hypersparse( template __global__ void for_all_major_for_all_nbr_low_degree( @@ -155,9 +154,9 @@ __global__ void for_all_major_for_all_nbr_low_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* keys, T* values) @@ -186,9 +185,9 @@ __global__ void for_all_major_for_all_nbr_low_degree( major, indices[i], weights ? 
(*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_col_key_input, e_op, keys + local_offset + i, values + local_offset + i); @@ -200,9 +199,9 @@ __global__ void for_all_major_for_all_nbr_low_degree( template __global__ void for_all_major_for_all_nbr_mid_degree( @@ -212,9 +211,9 @@ __global__ void for_all_major_for_all_nbr_mid_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* keys, T* values) @@ -246,9 +245,9 @@ __global__ void for_all_major_for_all_nbr_mid_degree( major, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_col_key_input, e_op, keys + local_offset + i, values + local_offset + i); @@ -260,9 +259,9 @@ __global__ void for_all_major_for_all_nbr_mid_degree( template __global__ void for_all_major_for_all_nbr_high_degree( @@ -272,9 +271,9 @@ __global__ void for_all_major_for_all_nbr_high_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* keys, T* values) @@ -302,9 +301,9 @@ __global__ void for_all_major_for_all_nbr_high_degree( major, indices[i], weights ? 
(*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_col_key_input, e_op, keys + local_offset + i, values + local_offset + i); @@ -345,9 +344,9 @@ std::tuple, BufferType> reduce_to_unique_kv_pairs( template std::tuple, @@ -355,14 +354,14 @@ std::tuple, transform_reduce_by_adj_matrix_row_col_key_e( raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, T init) { static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - static_assert(std::is_same::value_type, + static_assert(std::is_same::value); using vertex_t = typename GraphViewType::vertex_type; @@ -392,13 +391,23 @@ transform_reduce_by_adj_matrix_row_col_key_e( auto tmp_value_buffer = allocate_dataframe_buffer(tmp_keys.size(), handle.get_stream()); if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + if constexpr (GraphViewType::is_adj_matrix_transposed) { + matrix_partition_col_value_input.add_offset( + matrix_partition.get_major_value_start_offset()); + } else { + matrix_partition_row_value_input.add_offset( + matrix_partition.get_major_value_start_offset()); + } + auto matrix_partition_row_col_key_input = adj_matrix_row_col_key_input; + if constexpr ((adj_matrix_row_key && !GraphViewType::is_adj_matrix_transposed) || + (!adj_matrix_row_key && GraphViewType::is_adj_matrix_transposed)) { + matrix_partition_row_col_key_input.add_offset( + matrix_partition.get_major_value_start_offset()); + } + + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); if (segment_offsets) { // FIXME: we may further improve performance by 1) concurrently running kernels on different // segments; 2) individually tuning block sizes for different segments; and 3) adding one @@ -414,10 +423,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_first() + (*segment_offsets)[1], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? 
row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -432,10 +440,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition.get_major_first() + (*segment_offsets)[2], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -450,10 +457,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[2], matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -468,10 +474,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( <<>>( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -487,10 +492,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -562,32 +566,32 @@ transform_reduce_by_adj_matrix_row_col_key_e( * edges are determined by the graph adjacency matrix rows. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. - * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should - * coincide with vertex type). + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. + * @tparam AdjMatrixRowKeyInputWrapper Type of the wrapper for graph adjacency matrix row keys. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. 
* @tparam T Type of the values in (key, value) pairs. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param adj_matrix_row_key_first Iterator pointing to the adjacency matrix row key for the first - * (inclusive) column (assigned to this process in multi-GPU). `adj_matrix_row_key_last` (exclusive) - * is deduced as @p adj_matrix_row_key_first + @p graph_view.get_number_of_local_adj_matrix_rows(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. + * @param adj_matrix_row_key_input Device-copyable wrapper used to access row keys(for the rows + * assigned to this process in multi-GPU). Use either cugraph::row_properties_t::device_view(). Use + * copy_to_adj_matrix_row to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a transformed value to be reduced. * @param init Initial value to be added to the value in each transform-reduced (key, value) pair. * @return std::tuple Tuple of rmm::device_uvector and * rmm::device_uvector (if T is arithmetic scalar) or a tuple of rmm::device_uvector objects (if @@ -595,32 +599,31 @@ transform_reduce_by_adj_matrix_row_col_key_e( * type). 
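The contract this block documents, in serial form on toy data: every edge emits (key of its row, transformed value), and values are summed per key to yield the (unique key, reduced value) output vectors; the CUDA path reaches the same result via the per-segment kernels plus a sort/reduce-by-key pass:

#include <cstddef>
#include <cstdio>
#include <map>
#include <vector>

int main()
{
  std::vector<int> edge_rows{0, 0, 1, 2, 2, 2};  // source row of each edge
  std::vector<double> edge_values{1.0, 2.0, 3.0, 4.0, 5.0, 6.0};  // e_op outputs
  std::vector<int> row_key{7, 8, 7};             // key assigned to each row
  std::map<int, double> reduced;                 // key -> reduced value
  for (std::size_t e = 0; e < edge_rows.size(); ++e) {
    reduced[row_key[edge_rows[e]]] += edge_values[e];
  }
  for (auto [k, v] : reduced) { std::printf("key %d -> %g\n", k, v); }
  return 0;
}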
*/ template auto transform_reduce_by_adj_matrix_row_key_e( raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowKeyInputWrapper adj_matrix_row_key_input, EdgeOp e_op, T init) { static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - static_assert(std::is_same::value_type, + static_assert(std::is_same::value); - return detail::transform_reduce_by_adj_matrix_row_col_key_e( - handle, - graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_key_first, - e_op, - init); + return detail::transform_reduce_by_adj_matrix_row_col_key_e(handle, + graph_view, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_key_input, + e_op, + init); } // FIXME: EdgeOp & VertexOp in update_frontier_v_push_if_out_nbr concatenates push inidicator or @@ -632,33 +635,32 @@ auto transform_reduce_by_adj_matrix_row_key_e( * edges are determined by the graph adjacency matrix columns. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. - * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should - * coincide with vertex type). + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. + * @tparam AdjMatrixColKeyInputWrapper Type of the wrapper for graph adjacency matrix column keys. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @tparam T Type of the values in (key, value) pairs. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param adj_matrix_col_key_first Iterator pointing to the adjacency matrix column key for the - * first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_key_last` (exclusive) is deduced as @p adj_matrix_col_key_first + @p - * graph_view.get_number_of_local_adj_matrix_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). 
Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. + * @param adj_matrix_col_key_input Device-copyable wrapper used to access column keys(for the + * columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view(). Use copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a transformed value to be reduced. * @param init Initial value to be added to the value in each transform-reduced (key, value) pair. * @return std::tuple Tuple of rmm::device_uvector and * rmm::device_uvector (if T is arithmetic scalar) or a tuple of rmm::device_uvector objects (if @@ -666,32 +668,31 @@ auto transform_reduce_by_adj_matrix_row_key_e( * type). 
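One pattern recurs throughout this patch's per-partition loops: only the major-axis wrapper (rows when the adjacency matrix is not transposed, columns when it is) gets shifted by the partition's start offset, now via add_offset() on a local copy of the wrapper rather than iterator arithmetic. A compilable toy rendering of that fix-up (input_wrapper is a hypothetical stand-in):

#include <cassert>
#include <cstddef>
#include <vector>

template <typename T>
struct input_wrapper {  // hypothetical stand-in for the property wrappers
  T const* first{nullptr};
  void add_offset(std::ptrdiff_t d) { first += d; }
  T get(std::ptrdiff_t i) const { return first[i]; }
};

template <bool is_adj_matrix_transposed, typename T>
void shift_major_input(input_wrapper<T>& row_input,
                       input_wrapper<T>& col_input,
                       std::ptrdiff_t major_value_start_offset)
{
  if constexpr (is_adj_matrix_transposed) {
    col_input.add_offset(major_value_start_offset);  // columns are the major axis
  } else {
    row_input.add_offset(major_value_start_offset);  // rows are the major axis
  }
}

int main()
{
  std::vector<double> values{0.0, 1.0, 2.0, 3.0};
  input_wrapper<double> row{values.data()};
  input_wrapper<double> col{values.data()};
  shift_major_input<false>(row, col, 2);  // non-transposed: rows shift
  assert(row.get(0) == 2.0 && col.get(0) == 0.0);
  return 0;
}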
*/ template auto transform_reduce_by_adj_matrix_col_key_e( raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixColKeyInputWrapper adj_matrix_col_key_input, EdgeOp e_op, T init) { static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - static_assert(std::is_same::value_type, + static_assert(std::is_same::value); - return detail::transform_reduce_by_adj_matrix_row_col_key_e( - handle, - graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_col_key_first, - e_op, - init); + return detail::transform_reduce_by_adj_matrix_row_col_key_e(handle, + graph_view, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_col_key_input, + e_op, + init); } } // namespace cugraph diff --git a/cpp/include/cugraph/prims/transform_reduce_e.cuh b/cpp/include/cugraph/prims/transform_reduce_e.cuh index f46a00d37e4..f5b18e1efd6 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e.cuh @@ -37,8 +37,8 @@ namespace detail { int32_t constexpr transform_reduce_e_for_all_block_size = 128; template __global__ void for_all_major_for_all_nbr_hypersparse( @@ -47,8 +47,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( typename GraphViewType::weight_type, GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_hypersparse_first, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -80,8 +80,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( thrust::make_counting_iterator(edge_t{0}), thrust::make_counting_iterator(local_degree), [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, + &adj_matrix_row_value_input, + &adj_matrix_col_value_input, &e_op, major, indices, @@ -100,14 +100,14 @@ __global__ void for_all_major_for_all_nbr_hypersparse( : minor_offset; return evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); }, e_op_result_t{}, @@ -124,8 +124,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( } template __global__ void for_all_major_for_all_nbr_low_degree( @@ -135,8 +135,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -162,8 +162,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( thrust::make_counting_iterator(edge_t{0}), 
thrust::make_counting_iterator(local_degree), [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, + &adj_matrix_row_value_input, + &adj_matrix_col_value_input, &e_op, major_offset, indices, @@ -185,14 +185,14 @@ __global__ void for_all_major_for_all_nbr_low_degree( : minor_offset; return evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); }, e_op_result_t{}, @@ -209,8 +209,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( } template __global__ void for_all_major_for_all_nbr_mid_degree( @@ -220,8 +220,8 @@ __global__ void for_all_major_for_all_nbr_mid_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -262,14 +262,14 @@ __global__ void for_all_major_for_all_nbr_mid_degree( : minor_offset; auto e_op_result = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); } @@ -283,8 +283,8 @@ __global__ void for_all_major_for_all_nbr_mid_degree( } template __global__ void for_all_major_for_all_nbr_high_degree( @@ -294,8 +294,8 @@ __global__ void for_all_major_for_all_nbr_high_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -333,14 +333,14 @@ __global__ void for_all_major_for_all_nbr_high_degree( : minor_offset; auto e_op_result = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); } @@ -361,39 +361,40 @@ __global__ void for_all_major_for_all_nbr_high_degree( * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. 
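The reduction this doc block describes, spelled out serially on a toy CSR; the device code partitions the same loop by vertex degree (the hypersparse/low/mid/high kernels above), but the arithmetic is identical:

#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> offsets{0, 2, 3};            // toy CSR, 2 rows
  std::vector<int> indices{1, 0, 0};
  std::vector<double> weights{0.5, 1.5, 2.0};
  std::vector<double> row_values{10.0, 20.0};   // row (source) properties
  std::vector<double> col_values{1.0, 2.0};     // column (destination) properties
  auto e_op = [](int row, int col, double w, double rv, double cv) {
    return w * rv * cv;  // any transformed per-edge value
  };
  double result{3.0};  // init
  for (int r = 0; r < 2; ++r) {
    for (int e = offsets[r]; e < offsets[r + 1]; ++e) {
      result += e_op(r, indices[e], weights[e], row_values[r], col_values[indices[e]]);
    }
  }
  std::printf("%g\n", result);  // 3 + 10 + 15 + 40 = 68
  return 0;
}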
* @tparam T Type of the initial value. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a value to be reduced. * @param init Initial value to be added to the transform-reduced input vertex properties. * @return T Reduction of the @p edge_op outputs. */ template T transform_reduce_e(raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, EdgeOp e_op, T init) { @@ -416,13 +417,15 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + if constexpr (GraphViewType::is_adj_matrix_transposed) { + matrix_partition_col_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + } else { + matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + } + + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); if (segment_offsets) { // FIXME: we may further improve performance by 1) concurrently running kernels on different // segments; 2) individually tuning block sizes for different segments; and 3) adding one more @@ -437,8 +440,8 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_first() + (*segment_offsets)[1], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -451,8 +454,8 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition.get_major_first() + (*segment_offsets)[2], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -465,8 +468,8 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[2], matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -479,8 +482,8 @@ T transform_reduce_e(raft::handle_t const& handle, <<>>( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -495,8 +498,8 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh index 1d04dd7fa87..97c87a477cc 100644 --- a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh @@ -182,8 +182,8 @@ struct check_invalid_bucket_idx_t { }; template @@ -196,8 +196,8 @@ __device__ void push_if_buffer_element( typename GraphViewType::vertex_type row_offset, typename GraphViewType::vertex_type col, typename 
GraphViewType::weight_type weight, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -211,14 +211,14 @@ __device__ void push_if_buffer_element( auto col_offset = matrix_partition.get_minor_offset_from_minor_nocheck(col); auto e_op_result = evaluate_edge_op() .compute(key, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); if (e_op_result) { static_assert(sizeof(unsigned long long int) == sizeof(size_t)); @@ -241,8 +241,8 @@ __device__ void push_if_buffer_element( template @@ -254,8 +254,8 @@ __global__ void for_all_frontier_row_for_all_nbr_hypersparse( typename GraphViewType::vertex_type major_hypersparse_first, KeyIterator key_first, KeyIterator key_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -303,8 +303,8 @@ __global__ void for_all_frontier_row_for_all_nbr_hypersparse( row_offset, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, buffer_key_output_first, buffer_payload_output_first, buffer_idx_ptr, @@ -317,8 +317,8 @@ __global__ void for_all_frontier_row_for_all_nbr_hypersparse( template @@ -329,8 +329,8 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( GraphViewType::is_multi_gpu> matrix_partition, KeyIterator key_first, KeyIterator key_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -370,8 +370,8 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( row_offset, indices[i], weights ? 
(*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, buffer_key_output_first, buffer_payload_output_first, buffer_idx_ptr, @@ -383,8 +383,8 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( template @@ -395,8 +395,8 @@ __global__ void for_all_frontier_row_for_all_nbr_mid_degree( GraphViewType::is_multi_gpu> matrix_partition, KeyIterator key_first, KeyIterator key_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -438,8 +438,8 @@ __global__ void for_all_frontier_row_for_all_nbr_mid_degree( row_offset, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, buffer_key_output_first, buffer_payload_output_first, buffer_idx_ptr, @@ -452,8 +452,8 @@ __global__ void for_all_frontier_row_for_all_nbr_mid_degree( template @@ -464,8 +464,8 @@ __global__ void for_all_frontier_row_for_all_nbr_high_degree( GraphViewType::is_multi_gpu> matrix_partition, KeyIterator key_first, KeyIterator key_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -504,8 +504,8 @@ __global__ void for_all_frontier_row_for_all_nbr_high_degree( row_offset, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, buffer_key_output_first, buffer_payload_output_first, buffer_idx_ptr, @@ -752,10 +752,10 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexFrontierType Type of the vertex frontier class which abstracts vertex frontier * managements. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @tparam ReduceOp Type of the binary reduction operator. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. @@ -770,19 +770,19 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( * current iteration. * @param next_frontier_bucket_indices Indices of the VertexFrontier buckets to store new frontier * vertices for the next iteration. 
- * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a value to reduced by the @p - * reduce_op. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a value to be reduced the @p reduce_op. * @param reduce_op Binary operator takes two input arguments and reduce the two variables to one. * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) @@ -799,8 +799,8 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( */ template (payload_buffer, new_buffer_size, handle.get_stream()); } - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
vertex_t{0} - : matrix_partition.get_major_value_start_offset(); + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); std::vector h_thresholds(detail::num_sparse_segments_per_vertex_partition + @@ -1030,8 +1031,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[0], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), @@ -1047,8 +1048,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[0], get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[1], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), @@ -1064,8 +1065,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[1], get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[2], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), @@ -1082,8 +1083,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition.get_major_first() + (*segment_offsets)[3], get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[2], get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), @@ -1101,8 +1102,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), get_dataframe_buffer_end(matrix_partition_frontier_key_buffer), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), From a6fec7e8d403566055260fce6c2afe885207f4d0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:33:18 -0400 Subject: [PATCH 09/57] update algorithms to use row/col properties wrapper --- cpp/src/centrality/katz_centrality.cu | 12 +- cpp/src/community/louvain.cuh | 300 ++++++++++-------- .../components/weakly_connected_components.cu | 27 +- cpp/src/link_analysis/pagerank.cu | 12 +- cpp/src/structure/coarsen_graph.cu | 
19 +- cpp/src/structure/graph_view.cu | 16 +- cpp/src/structure/relabel.cu | 1 - cpp/src/traversal/bfs.cu | 5 +- cpp/src/traversal/sssp.cu | 25 +- 9 files changed, 221 insertions(+), 196 deletions(-) diff --git a/cpp/src/centrality/katz_centrality.cu b/cpp/src/centrality/katz_centrality.cu index a638694153b..7bbc03e254e 100644 --- a/cpp/src/centrality/katz_centrality.cu +++ b/cpp/src/centrality/katz_centrality.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -91,8 +92,7 @@ void katz_centrality(raft::handle_t const& handle, // old katz centrality values rmm::device_uvector tmp_katz_centralities( pull_graph_view.get_number_of_local_vertices(), handle.get_stream()); - rmm::device_uvector adj_matrix_row_katz_centralities( - pull_graph_view.get_number_of_local_adj_matrix_partition_rows(), handle.get_stream()); + row_properties_t adj_matrix_row_katz_centralities(handle, pull_graph_view); auto new_katz_centralities = katz_centralities; auto old_katz_centralities = tmp_katz_centralities.data(); size_t iter{0}; @@ -100,14 +100,14 @@ void katz_centrality(raft::handle_t const& handle, std::swap(new_katz_centralities, old_katz_centralities); copy_to_adj_matrix_row( - handle, pull_graph_view, old_katz_centralities, adj_matrix_row_katz_centralities.begin()); + handle, pull_graph_view, old_katz_centralities, adj_matrix_row_katz_centralities); copy_v_transform_reduce_in_nbr( handle, pull_graph_view, - adj_matrix_row_katz_centralities.begin(), - thrust::make_constant_iterator(0) /* dummy */, - [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + adj_matrix_row_katz_centralities.device_view(), + dummy_properties_t{}.device_view(), + [alpha] __device__(vertex_t, vertex_t, weight_t w, auto src_val, auto) { return static_cast(alpha * src_val * w); }, betas != nullptr ? 
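// reduction init: with per-vertex betas the constant term is presumably added in a
// separate per-vertex pass (so the reduction starts from 0); with a uniform beta it
// can simply be folded into the initial value here.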
result_t{0.0} : beta, diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 09189c95e38..c1f1d4ca67d 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ class Louvain { graph_view_t::is_adj_matrix_transposed, graph_view_t::is_multi_gpu>; + static_assert(!graph_view_t::is_adj_matrix_transposed); + Louvain(raft::handle_t const& handle, graph_view_t const& graph_view) : #ifdef TIMING @@ -60,12 +63,13 @@ class Louvain { handle_(handle), dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - cluster_keys_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), - cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), - vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), - src_vertex_weights_cache_v_(0, handle.get_stream_view()), - src_cluster_cache_v_(0, handle.get_stream_view()), - dst_cluster_cache_v_(0, handle.get_stream_view()) + cluster_keys_v_(0, handle.get_stream_view()), + cluster_weights_v_(0, handle.get_stream_view()), + vertex_weights_v_(0, handle.get_stream()), + src_vertex_weights_cache_(), + next_clusters_v_(0, handle.get_stream_view()), + src_clusters_cache_(), + dst_clusters_cache_() { } @@ -82,16 +86,16 @@ class Louvain { weight_t total_edge_weight = transform_reduce_e( handle_, current_graph_view_, - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0), - [] __device__(auto src, auto dst, weight_t wt, auto, auto) { return wt; }, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(auto, auto, weight_t wt, auto, auto) { return wt; }, weight_t{0}); while (dendrogram_->num_levels() < max_level) { // // Initialize every cluster to reference each vertex to itself // - initialize_dendrogram_level(current_graph_view_.get_number_of_local_vertices()); + initialize_dendrogram_level(); compute_vertex_and_cluster_weights(); @@ -148,10 +152,11 @@ class Louvain { } protected: - void initialize_dendrogram_level(vertex_t num_vertices) + void initialize_dendrogram_level() { - dendrogram_->add_level( - current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream_view()); + dendrogram_->add_level(current_graph_view_.get_local_vertex_first(), + current_graph_view_.get_number_of_local_vertices(), + handle_.get_stream_view()); thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), dendrogram_->current_level_begin(), @@ -160,7 +165,7 @@ class Louvain { } public: - weight_t modularity(weight_t total_edge_weight, weight_t resolution) + weight_t modularity(weight_t total_edge_weight, weight_t resolution) const { weight_t sum_degree_squared = thrust::transform_reduce( rmm::exec_policy(handle_.get_stream_view()), @@ -170,7 +175,7 @@ class Louvain { weight_t{0}, thrust::plus()); - if (graph_t::is_multi_gpu) { + if (graph_view_t::is_multi_gpu) { sum_degree_squared = host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, handle_.get_stream()); } @@ -178,9 +183,15 @@ class Louvain { weight_t sum_internal = transform_reduce_e( handle_, current_graph_view_, - d_src_cluster_cache_, - d_dst_cluster_cache_, - [] __device__(auto src, auto dst, weight_t wt, auto src_cluster, auto nbr_cluster) { + graph_view_t::is_multi_gpu + ? src_clusters_cache_.device_view() + : detail::major_properties_device_view_t( + next_clusters_v_.begin()), + graph_view_t::is_multi_gpu + ? 
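// multi-GPU: read cluster assignments from the src/dst caches filled via
// copy_to_adj_matrix_row/col; single-GPU: rows and columns coincide with the local
// vertex range, so next_clusters_v_ is wrapped in a properties view directly, no copy.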
dst_clusters_cache_.device_view() + : detail::minor_properties_device_view_t( + next_clusters_v_.begin()), + [] __device__(auto, auto, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { return wt; } else { @@ -213,10 +224,7 @@ class Louvain { vertex_weights_v_.size(), handle_.get_stream()); - d_src_vertex_weights_cache_ = - cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); - - if (graph_view_t::is_multi_gpu) { + if constexpr (graph_view_t::is_multi_gpu) { auto const comm_size = handle_.get_comms().get_size(); rmm::device_uvector rx_keys_v(0, handle_.get_stream_view()); rmm::device_uvector rx_weights_v(0, handle_.get_stream_view()); @@ -238,49 +246,50 @@ class Louvain { cluster_weights_v_ = std::move(rx_weights_v); } + if (graph_view_t::is_multi_gpu) { + src_vertex_weights_cache_ = + row_properties_t(handle_, current_graph_view_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, vertex_weights_v_.begin(), src_vertex_weights_cache_); + vertex_weights_v_.resize(0, handle_.get_stream()); + vertex_weights_v_.shrink_to_fit(handle_.get_stream()); + } + timer_stop(handle_.get_stream_view()); } template - T* cache_src_vertex_properties(rmm::device_uvector& input, rmm::device_uvector& src_cache_v) + void cache_src_properties(rmm::device_uvector& input, + row_properties_t& src_cache_) { - if (graph_view_t::is_multi_gpu) { - src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), - handle_.get_stream_view()); - copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); - return src_cache_v.begin(); - } else { - return input.begin(); - } + copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_); } template - T* cache_dst_vertex_properties(rmm::device_uvector& input, rmm::device_uvector& dst_cache_v) + void cache_dst_properties(rmm::device_uvector& input, + col_properties_t& dst_cache_) { - if (graph_view_t::is_multi_gpu) { - dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), - handle_.get_stream_view()); - copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); - return dst_cache_v.begin(); - } else { - return input.begin(); - } + copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_); } virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); - rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), - handle_.get_stream_view()); + next_clusters_v_ = + rmm::device_uvector(dendrogram_->current_level_size(), handle_.get_stream()); - raft::copy(next_cluster_v.begin(), + raft::copy(next_clusters_v_.begin(), dendrogram_->current_level_begin(), dendrogram_->current_level_size(), handle_.get_stream()); - d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); - d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); + if constexpr (graph_view_t::is_multi_gpu) { + src_clusters_cache_ = row_properties_t(handle_, current_graph_view_); + cache_src_properties(next_clusters_v_, src_clusters_cache_); + dst_clusters_cache_ = col_properties_t(handle_, current_graph_view_); + cache_dst_properties(next_clusters_v_, dst_clusters_cache_); + } weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -293,7 +302,7 @@ class Louvain { while (new_Q > (cur_Q + 0.0001)) { cur_Q = new_Q; - 
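// one local-moving pass: each vertex evaluates the modularity gain of adopting a
// neighboring cluster and keeps the best positive move; alternating `up_down`
// restricts the allowed move direction per pass, a common guard against pairs of
// vertices swapping clusters back and forth across iterations.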
update_by_delta_modularity(total_edge_weight, resolution, next_cluster_v, up_down); + update_by_delta_modularity(total_edge_weight, resolution, next_clusters_v_, up_down); up_down = !up_down; @@ -301,8 +310,8 @@ class Louvain { if (new_Q > cur_Q) { raft::copy(dendrogram_->current_level_begin(), - next_cluster_v.begin(), - next_cluster_v.size(), + next_clusters_v_.begin(), + next_clusters_v_.size(), handle_.get_stream()); } } @@ -311,86 +320,67 @@ class Louvain { return cur_Q; } - void compute_cluster_sum_and_subtract(rmm::device_uvector& old_cluster_sum_v, - rmm::device_uvector& cluster_subtract_v) + std::tuple, rmm::device_uvector> + compute_cluster_sum_and_subtract() const { - auto output_buffer = cugraph::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream_view()); + rmm::device_uvector old_cluster_sum_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector cluster_subtract_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, - d_src_cluster_cache_, - d_dst_cluster_cache_, + graph_view_t::is_multi_gpu + ? src_clusters_cache_.device_view() + : detail::major_properties_device_view_t( + next_clusters_v_.data()), + graph_view_t::is_multi_gpu + ? dst_clusters_cache_.device_view() + : detail::minor_properties_device_view_t( + next_clusters_v_.data()), [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { - weight_t subtract{0}; weight_t sum{0}; + weight_t subtract{0}; if (src == dst) subtract = wt; else if (src_cluster == nbr_cluster) sum = wt; - return thrust::make_tuple(subtract, sum); + return thrust::make_tuple(sum, subtract); }, thrust::make_tuple(weight_t{0}, weight_t{0}), - cugraph::get_dataframe_buffer_begin>(output_buffer)); - - thrust::transform( - rmm::exec_policy(handle_.get_stream_view()), - cugraph::get_dataframe_buffer_begin>(output_buffer), - cugraph::get_dataframe_buffer_begin>(output_buffer) + - current_graph_view_.get_number_of_local_vertices(), - old_cluster_sum_v.begin(), - [] __device__(auto p) { return thrust::get<1>(p); }); + thrust::make_zip_iterator(old_cluster_sum_v.begin(), cluster_subtract_v.begin())); - thrust::transform( - rmm::exec_policy(handle_.get_stream_view()), - cugraph::get_dataframe_buffer_begin>(output_buffer), - cugraph::get_dataframe_buffer_begin>(output_buffer) + - current_graph_view_.get_number_of_local_vertices(), - cluster_subtract_v.begin(), - [] __device__(auto p) { return thrust::get<0>(p); }); + return std::make_tuple(std::move(old_cluster_sum_v), std::move(cluster_subtract_v)); } void update_by_delta_modularity(weight_t total_edge_weight, weight_t resolution, - rmm::device_uvector& next_cluster_v, + rmm::device_uvector& next_clusters_v_, bool up_down) { - rmm::device_uvector old_cluster_sum_v( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); - rmm::device_uvector cluster_subtract_v( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); - rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), - handle_.get_stream()); - - compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); - - auto output_buffer = cugraph::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); - - vertex_t* map_key_first; - vertex_t* map_key_last; - weight_t* map_value_first; - - if (graph_t::is_multi_gpu) { + rmm::device_uvector 
vertex_cluster_weights_v(0, handle_.get_stream()); + row_properties_t src_cluster_weights{}; + if constexpr (graph_view_t::is_multi_gpu) { cugraph::detail::compute_gpu_id_from_vertex_t vertex_to_gpu_id_op{ handle_.get_comms().get_size()}; - src_cluster_weights_v = - cugraph::collect_values_for_keys(handle_.get_comms(), - cluster_keys_v_.begin(), - cluster_keys_v_.end(), - cluster_weights_v_.data(), - d_src_cluster_cache_, - d_src_cluster_cache_ + src_cluster_cache_v_.size(), - vertex_to_gpu_id_op, - handle_.get_stream()); - - map_key_first = cluster_keys_v_.begin(); - map_key_last = cluster_keys_v_.end(); - map_value_first = cluster_weights_v_.begin(); + vertex_cluster_weights_v = cugraph::collect_values_for_keys(handle_.get_comms(), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.data(), + next_clusters_v_.begin(), + next_clusters_v_.end(), + vertex_to_gpu_id_op, + handle_.get_stream()); + + src_cluster_weights = row_properties_t(handle_, current_graph_view_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, vertex_cluster_weights_v.begin(), src_cluster_weights); + vertex_cluster_weights_v.resize(0, handle_.get_stream()); + vertex_cluster_weights_v.shrink_to_fit(handle_.get_stream()); } else { thrust::sort_by_key(rmm::exec_policy(handle_.get_stream_view()), cluster_keys_v_.begin(), @@ -398,9 +388,9 @@ class Louvain { cluster_weights_v_.begin()); thrust::transform(rmm::exec_policy(handle_.get_stream_view()), - next_cluster_v.begin(), - next_cluster_v.end(), - src_cluster_weights_v.begin(), + next_clusters_v_.begin(), + next_clusters_v_.end(), + vertex_cluster_weights_v.begin(), [d_cluster_weights = cluster_weights_v_.data(), d_cluster_keys = cluster_keys_v_.data(), num_clusters = cluster_keys_v_.size()] __device__(vertex_t cluster) { @@ -408,24 +398,51 @@ class Louvain { thrust::seq, d_cluster_keys, d_cluster_keys + num_clusters, cluster); return d_cluster_weights[pos - d_cluster_keys]; }); + } - map_key_first = d_src_cluster_cache_; - map_key_last = d_src_cluster_cache_ + src_cluster_weights_v.size(); - map_value_first = src_cluster_weights_v.begin(); + auto [old_cluster_sum_v, cluster_subtract_v] = compute_cluster_sum_and_subtract(); + + row_properties_t> + src_old_cluster_sum_subtract_pairs{}; + if constexpr (graph_view_t::is_multi_gpu) { + src_old_cluster_sum_subtract_pairs = + row_properties_t>(handle_, + current_graph_view_); + copy_to_adj_matrix_row(handle_, + current_graph_view_, + thrust::make_zip_iterator(thrust::make_tuple( + old_cluster_sum_v.begin(), cluster_subtract_v.begin())), + src_old_cluster_sum_subtract_pairs); } - rmm::device_uvector src_old_cluster_sum_v( - current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); - rmm::device_uvector src_cluster_subtract_v( - current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); - copy_to_adj_matrix_row( - handle_, current_graph_view_, old_cluster_sum_v.begin(), src_old_cluster_sum_v.begin()); - copy_to_adj_matrix_row( - handle_, current_graph_view_, cluster_subtract_v.begin(), src_cluster_subtract_v.begin()); + auto output_buffer = cugraph::allocate_dataframe_buffer>( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + + auto cluster_old_sum_subtract_pair_first = thrust::make_zip_iterator( + thrust::make_tuple(old_cluster_sum_v.cbegin(), cluster_subtract_v.cbegin())); + auto zipped_src_device_view = + graph_view_t::is_multi_gpu + ? 
device_view_concat(src_vertex_weights_cache_.device_view(), + src_clusters_cache_.device_view(), + src_cluster_weights.device_view(), + src_old_cluster_sum_subtract_pairs.device_view()) + : device_view_concat( + detail::major_properties_device_view_t( + vertex_weights_v_.data()), + detail::major_properties_device_view_t( + next_clusters_v_.data()), + detail::major_properties_device_view_t( + vertex_cluster_weights_v.data()), + detail::major_properties_device_view_t( + cluster_old_sum_subtract_pair_first)); copy_v_transform_reduce_key_aggregated_out_nbr( handle_, current_graph_view_, +#if 1 + zipped_src_device_view, + graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() : detail::minor_properties_device_view_t(next_clusters_v_.data()), +#else thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, src_cluster_subtract_v.begin(), @@ -433,16 +450,25 @@ class Louvain { src_cluster_weights_v.begin())), d_dst_cluster_cache_, - map_key_first, - map_key_last, - map_value_first, +#endif + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.begin(), [total_edge_weight, resolution] __device__( auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { +#if 1 + auto k_k = thrust::get<0>(src_info); + auto src_cluster = thrust::get<1>(src_info); + auto a_old = thrust::get<2>(src_info); + auto old_cluster_sum = thrust::get<3>(src_info); + auto cluster_subtract = thrust::get<4>(src_info); +#else auto old_cluster_sum = thrust::get<0>(src_info); auto k_k = thrust::get<1>(src_info); auto cluster_subtract = thrust::get<2>(src_info); auto src_cluster = thrust::get<3>(src_info); auto a_old = thrust::get<4>(src_info); +#endif if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; @@ -465,10 +491,10 @@ class Louvain { thrust::transform( rmm::exec_policy(handle_.get_stream_view()), - next_cluster_v.begin(), - next_cluster_v.end(), + next_clusters_v_.begin(), + next_clusters_v_.end(), cugraph::get_dataframe_buffer_begin>(output_buffer), - next_cluster_v.begin(), + next_clusters_v_.begin(), [up_down] __device__(vertex_t old_cluster, auto p) { vertex_t new_cluster = thrust::get<0>(p); weight_t delta_modularity = thrust::get<1>(p); @@ -478,17 +504,19 @@ class Louvain { : old_cluster; }); - d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); - d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); + if constexpr (graph_view_t::is_multi_gpu) { + cache_src_properties(next_clusters_v_, src_clusters_cache_); + cache_dst_properties(next_clusters_v_, dst_clusters_cache_); + } std::tie(cluster_keys_v_, cluster_weights_v_) = cugraph::transform_reduce_by_adj_matrix_row_key_e( handle_, current_graph_view_, - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0), - d_src_cluster_cache_, - [] __device__(auto src, auto dst, auto wt, auto x, auto y) { return wt; }, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + src_clusters_cache_.device_view(), + [] __device__(auto, auto, auto wt, auto, auto) { return wt; }, weight_t{0}); } @@ -534,16 +562,16 @@ class Louvain { std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; - rmm::device_uvector vertex_weights_v_; - rmm::device_uvector src_vertex_weights_cache_v_; - rmm::device_uvector src_cluster_cache_v_; - rmm::device_uvector dst_cluster_cache_v_; + // FIXME: better move inside the update_by_delta_modularity? 
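// A minimal sketch of the caching pattern behind the members below (the template
// arguments and the names `view` and `values_v` are illustrative, assuming a
// multi-GPU graph view):
//   row_properties_t<graph_view_t, vertex_t> cache(handle_, view);   // sized to the local rows
//   copy_to_adj_matrix_row(handle_, view, values_v.begin(), cache);  // fill from local vertex values
//   auto dv = cache.device_view();  // trivially device-copyable; kernels read dv.get(offset)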
rmm::device_uvector cluster_keys_v_; rmm::device_uvector cluster_weights_v_; - weight_t* d_src_vertex_weights_cache_; - vertex_t* d_src_cluster_cache_; - vertex_t* d_dst_cluster_cache_; + rmm::device_uvector vertex_weights_v_; + row_properties_t src_vertex_weights_cache_; // src cache for vertex_weights_v_ + + rmm::device_uvector next_clusters_v_; + row_properties_t src_clusters_cache_; // src cache for next_clusters_v_ + col_properties_t dst_clusters_cache_; // dst cache for next_clusters_v_ #ifdef TIMING HighResTimer hr_timer_; diff --git a/cpp/src/components/weakly_connected_components.cu b/cpp/src/components/weakly_connected_components.cu index f20356a6d58..00b211976de 100644 --- a/cpp/src/components/weakly_connected_components.cu +++ b/cpp/src/components/weakly_connected_components.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -442,7 +443,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, init_max_new_roots = std::min(init_max_new_roots, max_new_roots); } - // 2-3. initialize vertex frontier, edge_buffer, and col_components (if multi-gpu) + // 2-3. initialize vertex frontier, edge_buffer, and adj_matrix_col_components (if multi-gpu) VertexFrontier num_edge_inserts(size_t{0}, handle.get_stream_view()); - rmm::device_uvector col_components( - GraphViewType::is_multi_gpu ? level_graph_view.get_number_of_local_adj_matrix_partition_cols() - : vertex_t{0}, - handle.get_stream_view()); - if (GraphViewType::is_multi_gpu) { - thrust::fill(rmm::exec_policy(handle.get_stream_view()), - col_components.begin(), - col_components.end(), - invalid_component_id::value); + auto adj_matrix_col_components = GraphViewType::is_multi_gpu ? col_properties_t(handle, level_graph_view) : col_properties_t(); + if constexpr (GraphViewType::is_multi_gpu) { + adj_matrix_col_components.fill(invalid_component_id::value, handle.get_stream()); } // 2.4 iterate till every vertex gets visited @@ -508,7 +503,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, break; } - if (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_multi_gpu) { copy_to_adj_matrix_col( handle, level_graph_view, @@ -519,7 +514,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, .end() .get_iterator_tuple()), level_components, - col_components.begin()); + adj_matrix_col_components); } auto max_pushes = @@ -543,9 +538,9 @@ void weakly_connected_components_impl(raft::handle_t const& handle, GraphViewType::is_multi_gpu ? std::vector{static_cast(Bucket::next), static_cast(Bucket::conflict)} : std::vector{static_cast(Bucket::next)}, - thrust::make_counting_iterator(0) /* dummy */, - thrust::make_counting_iterator(0) /* dummy */, - [col_components = GraphViewType::is_multi_gpu ? col_components.data() : level_components, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [col_components = GraphViewType::is_multi_gpu ? 
adj_matrix_col_components.mutable_device_view() : detail::minor_properties_device_view_t(level_components), col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), edge_buffer_first = get_dataframe_buffer_begin>(edge_buffer), @@ -556,7 +551,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, // FIXME: better switch to atomic_ref after // https://github.com/nvidia/libcudacxx/milestone/2 auto old = - atomicCAS(col_components + col_offset, invalid_component_id::value, tag); + atomicCAS(&(col_components.get(col_offset)), invalid_component_id::value, tag); if (old != invalid_component_id::value && old != tag) { // conflict static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto edge_idx = atomicAdd(reinterpret_cast(num_edge_inserts), diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu index 2d38371f7fc..b4fff1ce3e2 100644 --- a/cpp/src/link_analysis/pagerank.cu +++ b/cpp/src/link_analysis/pagerank.cu @@ -101,9 +101,9 @@ void pagerank( auto num_nonpositive_edge_weights = count_if_e( handle, pull_graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, - [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w <= 0.0; }); CUGRAPH_EXPECTS(num_nonpositive_edge_weights == 0, @@ -233,9 +233,9 @@ void pagerank( copy_v_transform_reduce_in_nbr( handle, pull_graph_view, - adj_matrix_row_pageranks.begin(), - dummy_properties_t{}.begin(), - [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + adj_matrix_row_pageranks.device_view(), + dummy_properties_t{}.device_view(), + [alpha] __device__(vertex_t, vertex_t, weight_t w, auto src_val, auto) { return src_val * w * alpha; }, unvarying_part, diff --git a/cpp/src/structure/coarsen_graph.cu b/cpp/src/structure/coarsen_graph.cu index 967234b9e54..5bfa1e7456c 100644 --- a/cpp/src/structure/coarsen_graph.cu +++ b/cpp/src/structure/coarsen_graph.cu @@ -131,22 +131,19 @@ template + typename AdjMatrixMinorLabelInputWrapper> std::tuple, rmm::device_uvector, std::optional>> decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( raft::handle_t const& handle, matrix_partition_device_view_t const matrix_partition, - VertexIterator0 const major_label_first, - VertexIterator1 const minor_label_first, + vertex_t const* major_label_first, + AdjMatrixMinorLabelInputWrapper const minor_label_input, std::optional> const& segment_offsets) { static_assert( - std::is_same_v::value_type, vertex_t>); - static_assert( - std::is_same_v::value_type, vertex_t>); + std::is_same_v); // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory @@ -161,12 +158,12 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( pair_first + edgelist_major_vertices.size(), pair_first, [major_label_first, - minor_label_first, + minor_label_input, major_first = matrix_partition.get_major_first(), minor_first = matrix_partition.get_minor_first()] __device__(auto val) { return thrust::make_tuple( *(major_label_first + (thrust::get<0>(val) - major_first)), - *(minor_label_first + (thrust::get<1>(val) - minor_first))); + minor_label_input.get(thrust::get<1>(val) - minor_first)); }); auto number_of_edges = groupby_e_and_coarsen_edgelist( @@ -304,7 
+301,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)), major_labels.data(), - adj_matrix_minor_labels.begin(), + adj_matrix_minor_labels.device_view(), graph_view.get_local_adj_matrix_partition_segment_offsets(i)); // 1-2. globally shuffle @@ -512,7 +509,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view()), labels, - labels, + detail::minor_properties_device_view_t(labels), graph_view.get_local_adj_matrix_partition_segment_offsets(0)); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), diff --git a/cpp/src/structure/graph_view.cu b/cpp/src/structure/graph_view.cu index 05de14afd19..6c22fbac24c 100644 --- a/cpp/src/structure/graph_view.cu +++ b/cpp/src/structure/graph_view.cu @@ -93,8 +93,8 @@ rmm::device_uvector compute_minor_degrees( copy_v_transform_reduce_out_nbr( handle, graph_view, - dummy_properties_t{}.begin(), - dummy_properties_t{}.begin(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [] __device__(vertex_t, vertex_t, weight_t, auto, auto) { return edge_t{1}; }, edge_t{0}, minor_degrees.data()); @@ -102,8 +102,8 @@ rmm::device_uvector compute_minor_degrees( copy_v_transform_reduce_in_nbr( handle, graph_view, - dummy_properties_t{}.begin(), - dummy_properties_t{}.begin(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [] __device__(vertex_t, vertex_t, weight_t, auto, auto) { return edge_t{1}; }, edge_t{0}, minor_degrees.data()); @@ -128,8 +128,8 @@ rmm::device_uvector compute_weight_sums( copy_v_transform_reduce_in_nbr( handle, graph_view, - dummy_properties_t{}.begin(), - dummy_properties_t{}.begin(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w; }, weight_t{0.0}, weight_sums.data()); @@ -137,8 +137,8 @@ rmm::device_uvector compute_weight_sums( copy_v_transform_reduce_out_nbr( handle, graph_view, - dummy_properties_t{}.begin(), - dummy_properties_t{}.begin(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w; }, weight_t{0.0}, weight_sums.data()); diff --git a/cpp/src/structure/relabel.cu b/cpp/src/structure/relabel.cu index d01143a922e..b68b10b1838 100644 --- a/cpp/src/structure/relabel.cu +++ b/cpp/src/structure/relabel.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu index fa653b7ddb3..70a6c72bc10 100644 --- a/cpp/src/traversal/bfs.cu +++ b/cpp/src/traversal/bfs.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -117,8 +118,8 @@ void bfs(raft::handle_t const& handle, vertex_frontier, static_cast(Bucket::cur), std::vector{static_cast(Bucket::next)}, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [vertex_partition, distances] __device__( vertex_t src, vertex_t dst, auto src_val, auto dst_val) { auto push = true; diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu index d1cf68741f9..6c3595723d8 100644 --- a/cpp/src/traversal/sssp.cu +++ b/cpp/src/traversal/sssp.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -78,9 +79,9 @@ void sssp(raft::handle_t const& handle, auto num_negative_edge_weights = count_if_e(handle, 
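// dummy_properties_t<vertex_t>{}.device_view() is the stateless no-op wrapper used when
// the edge op ignores row/column values; it replaces the old
// thrust::make_constant_iterator(0) /* dummy */ placeholders.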
push_graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, - [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w < 0.0; }); CUGRAPH_EXPECTS(num_negative_edge_weights == 0, @@ -112,9 +113,9 @@ void sssp(raft::handle_t const& handle, thrust::tie(average_vertex_degree, average_edge_weight) = transform_reduce_e( handle, push_graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, - [] __device__(vertex_t row, vertex_t col, weight_t w, auto row_val, auto col_val) { + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return thrust::make_tuple(weight_t{1.0}, w); }, thrust::make_tuple(weight_t{0.0}, weight_t{0.0})); @@ -134,8 +135,10 @@ void sssp(raft::handle_t const& handle, // 5. SSSP iteration - row_properties_t adj_matrix_row_distances(handle, push_graph_view); + auto adj_matrix_row_distances = GraphViewType::is_multi_gpu ? row_properties_t(handle, push_graph_view) : row_properties_t{}; + if (GraphViewType::is_multi_gpu) { adj_matrix_row_distances.fill(std::numeric_limits::max(), handle.get_stream()); + } if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).insert(source_vertex); @@ -143,6 +146,7 @@ void sssp(raft::handle_t const& handle, auto near_far_threshold = delta; while (true) { + if (GraphViewType::is_multi_gpu) { copy_to_adj_matrix_row( handle, push_graph_view, @@ -150,6 +154,7 @@ void sssp(raft::handle_t const& handle, vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), distances, adj_matrix_row_distances); + } auto vertex_partition = vertex_partition_device_view_t( push_graph_view.get_vertex_partition_view()); @@ -160,10 +165,10 @@ void sssp(raft::handle_t const& handle, vertex_frontier, static_cast(Bucket::cur_near), std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, - adj_matrix_row_distances.begin(), - thrust::make_constant_iterator(0) /* dummy */, + GraphViewType::is_multi_gpu ? 
adj_matrix_row_distances.device_view() : detail::major_properties_device_view_t(distances), + dummy_properties_t{}.device_view(), [vertex_partition, distances, cutoff] __device__( - vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + vertex_t src, vertex_t dst, weight_t w, auto src_val, auto) { auto push = true; auto new_distance = src_val + w; auto threshold = cutoff; From 670d891776eaa7feb134ed2d70cf98e593e4e7ef Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:44:44 -0400 Subject: [PATCH 10/57] clang-format --- cpp/src/centrality/katz_centrality.cu | 3 +- cpp/src/community/louvain.cuh | 19 +++++++---- .../components/weakly_connected_components.cu | 20 ++++++++---- cpp/src/link_analysis/pagerank.cu | 14 ++++---- cpp/src/structure/coarsen_graph.cu | 4 +-- cpp/src/traversal/sssp.cu | 32 ++++++++++--------- 6 files changed, 51 insertions(+), 41 deletions(-) diff --git a/cpp/src/centrality/katz_centrality.cu b/cpp/src/centrality/katz_centrality.cu index 884dacc925f..a9722b448e1 100644 --- a/cpp/src/centrality/katz_centrality.cu +++ b/cpp/src/centrality/katz_centrality.cu @@ -92,7 +92,8 @@ void katz_centrality(raft::handle_t const& handle, // old katz centrality values rmm::device_uvector tmp_katz_centralities( pull_graph_view.get_number_of_local_vertices(), handle.get_stream()); - row_properties_t adj_matrix_row_katz_centralities(handle, pull_graph_view); + row_properties_t adj_matrix_row_katz_centralities(handle, + pull_graph_view); auto new_katz_centralities = katz_centralities; auto old_katz_centralities = tmp_katz_centralities.data(); size_t iter{0}; diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 3d262930fb3..aff64cad704 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -406,7 +406,7 @@ class Louvain { auto [old_cluster_sum_v, cluster_subtract_v] = compute_cluster_sum_and_subtract(); row_properties_t> - src_old_cluster_sum_subtract_pairs{}; + src_old_cluster_sum_subtract_pairs{}; if constexpr (graph_view_t::is_multi_gpu) { src_old_cluster_sum_subtract_pairs = row_properties_t>(handle_, @@ -426,9 +426,9 @@ class Louvain { auto zipped_src_device_view = graph_view_t::is_multi_gpu ? device_view_concat(src_vertex_weights_cache_.device_view(), - src_clusters_cache_.device_view(), - src_cluster_weights.device_view(), - src_old_cluster_sum_subtract_pairs.device_view()) + src_clusters_cache_.device_view(), + src_cluster_weights.device_view(), + src_old_cluster_sum_subtract_pairs.device_view()) : device_view_concat( detail::major_properties_device_view_t( vertex_weights_v_.data()), @@ -436,7 +436,8 @@ class Louvain { next_clusters_v_.data()), detail::major_properties_device_view_t( vertex_cluster_weights_v.data()), - detail::major_properties_device_view_t( + detail::major_properties_device_view_t( cluster_old_sum_subtract_pair_first)); copy_v_transform_reduce_key_aggregated_out_nbr( @@ -444,7 +445,10 @@ class Louvain { current_graph_view_, #if 1 zipped_src_device_view, - graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() : detail::minor_properties_device_view_t(next_clusters_v_.data()), + graph_view_t::is_multi_gpu + ? 
dst_clusters_cache_.device_view() + : detail::minor_properties_device_view_t( + next_clusters_v_.data()), #else thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, @@ -570,7 +574,8 @@ class Louvain { rmm::device_uvector cluster_weights_v_; rmm::device_uvector vertex_weights_v_; - row_properties_t src_vertex_weights_cache_; // src cache for vertex_weights_v_ + row_properties_t + src_vertex_weights_cache_; // src cache for vertex_weights_v_ rmm::device_uvector next_clusters_v_; row_properties_t src_clusters_cache_; // src cache for next_clusters_v_ diff --git a/cpp/src/components/weakly_connected_components.cu b/cpp/src/components/weakly_connected_components.cu index 26abb0f9f2a..35f4343f721 100644 --- a/cpp/src/components/weakly_connected_components.cu +++ b/cpp/src/components/weakly_connected_components.cu @@ -458,7 +458,10 @@ void weakly_connected_components_impl(raft::handle_t const& handle, // requires placing the atomic variable on managed memory and this make it less attractive. rmm::device_scalar num_edge_inserts(size_t{0}, handle.get_stream_view()); - auto adj_matrix_col_components = GraphViewType::is_multi_gpu ? col_properties_t(handle, level_graph_view) : col_properties_t(); + auto adj_matrix_col_components = + GraphViewType::is_multi_gpu + ? col_properties_t(handle, level_graph_view) + : col_properties_t(); if constexpr (GraphViewType::is_multi_gpu) { adj_matrix_col_components.fill(invalid_component_id::value, handle.get_stream()); } @@ -536,10 +539,13 @@ void weakly_connected_components_impl(raft::handle_t const& handle, GraphViewType::is_multi_gpu ? std::vector{static_cast(Bucket::next), static_cast(Bucket::conflict)} : std::vector{static_cast(Bucket::next)}, - dummy_properties_t{}.device_view(), - dummy_properties_t{}.device_view(), - [col_components = GraphViewType::is_multi_gpu ? adj_matrix_col_components.mutable_device_view() : detail::minor_properties_device_view_t(level_components), - col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [col_components = + GraphViewType::is_multi_gpu + ? 
adj_matrix_col_components.mutable_device_view() + : detail::minor_properties_device_view_t(level_components), + col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), edge_buffer_first = get_dataframe_buffer_begin>(edge_buffer), num_edge_inserts = @@ -548,8 +554,8 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto col_offset = dst - col_first; // FIXME: better switch to atomic_ref after // https://github.com/nvidia/libcudacxx/milestone/2 - auto old = - atomicCAS(&(col_components.get(col_offset)), invalid_component_id::value, tag); + auto old = atomicCAS( + &(col_components.get(col_offset)), invalid_component_id::value, tag); if (old != invalid_component_id::value && old != tag) { // conflict static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto edge_idx = atomicAdd(reinterpret_cast(num_edge_inserts), diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu index d06677a532d..fe6a182f365 100644 --- a/cpp/src/link_analysis/pagerank.cu +++ b/cpp/src/link_analysis/pagerank.cu @@ -98,14 +98,12 @@ void pagerank( } if (pull_graph_view.is_weighted()) { - auto num_nonpositive_edge_weights = count_if_e( - handle, - pull_graph_view, - dummy_properties_t{}.device_view(), - dummy_properties_t{}.device_view(), - [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { - return w <= 0.0; - }); + auto num_nonpositive_edge_weights = + count_if_e(handle, + pull_graph_view, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w <= 0.0; }); CUGRAPH_EXPECTS(num_nonpositive_edge_weights == 0, "Invalid input argument: input graph should have postive edge weights."); } diff --git a/cpp/src/structure/coarsen_graph.cu b/cpp/src/structure/coarsen_graph.cu index cf40b0443d5..e56a3f5503f 100644 --- a/cpp/src/structure/coarsen_graph.cu +++ b/cpp/src/structure/coarsen_graph.cu @@ -142,8 +142,7 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( AdjMatrixMinorLabelInputWrapper const minor_label_input, std::optional> const& segment_offsets) { - static_assert( - std::is_same_v); + static_assert(std::is_same_v); // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory @@ -231,7 +230,6 @@ coarsen_graph( copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels); } - std::vector> coarsened_edgelist_major_vertices{}; std::vector> coarsened_edgelist_minor_vertices{}; auto coarsened_edgelist_weights = diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu index c049d853867..4ca37b3bdaa 100644 --- a/cpp/src/traversal/sssp.cu +++ b/cpp/src/traversal/sssp.cu @@ -78,11 +78,9 @@ void sssp(raft::handle_t const& handle, auto num_negative_edge_weights = count_if_e(handle, push_graph_view, - dummy_properties_t{}.device_view(), - dummy_properties_t{}.device_view(), - [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { - return w < 0.0; - }); + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w < 0.0; }); CUGRAPH_EXPECTS(num_negative_edge_weights == 0, "Invalid input argument: input graph should have non-negative edge weights."); } @@ -134,7 +132,9 @@ void sssp(raft::handle_t const& handle, // 5. SSSP iteration - auto adj_matrix_row_distances = GraphViewType::is_multi_gpu ? 
row_properties_t(handle, push_graph_view) : row_properties_t{}; + auto adj_matrix_row_distances = + GraphViewType::is_multi_gpu ? row_properties_t(handle, push_graph_view) + : row_properties_t{}; if (GraphViewType::is_multi_gpu) { adj_matrix_row_distances.fill(std::numeric_limits::max(), handle.get_stream()); } @@ -145,14 +145,14 @@ void sssp(raft::handle_t const& handle, auto near_far_threshold = delta; while (true) { - if (GraphViewType::is_multi_gpu) { - copy_to_adj_matrix_row( - handle, - push_graph_view, - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), - distances, - adj_matrix_row_distances); + if (GraphViewType::is_multi_gpu) { + copy_to_adj_matrix_row( + handle, + push_graph_view, + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), + distances, + adj_matrix_row_distances); } auto vertex_partition = vertex_partition_device_view_t( @@ -164,7 +164,9 @@ void sssp(raft::handle_t const& handle, vertex_frontier, static_cast(Bucket::cur_near), std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, - GraphViewType::is_multi_gpu ? adj_matrix_row_distances.device_view() : detail::major_properties_device_view_t(distances), + GraphViewType::is_multi_gpu + ? adj_matrix_row_distances.device_view() + : detail::major_properties_device_view_t(distances), dummy_properties_t{}.device_view(), [vertex_partition, distances, cutoff] __device__( vertex_t src, vertex_t dst, weight_t w, auto src_val, auto) { From e2e4b1383dd80a4bde9a4125d82a7711b1a898e7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:48:53 -0400 Subject: [PATCH 11/57] replace rmm::exec_policy(handle.get_stream()) with handle.get_thrust_policy() --- cpp/src/structure/renumber_edgelist.cu | 2 +- cpp/tests/sampling/rw_low_level_test.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist.cu b/cpp/src/structure/renumber_edgelist.cu index 4123bb5f218..ecdb06e399a 100644 --- a/cpp/src/structure/renumber_edgelist.cu +++ b/cpp/src/structure/renumber_edgelist.cu @@ -550,7 +550,7 @@ void expensive_check_edgelist( for (int j = 0; j < row_comm_size; ++j) { CUGRAPH_EXPECTS( thrust::count_if( - rmm::exec_policy(handle.get_stream_view()), + handle.get_thrust_policy(), edgelist_minor_vertices[i] + (*edgelist_intra_partition_segment_offsets)[i][j], edgelist_minor_vertices[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1], [row_comm_size, diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu index 0977d1031bf..3711fb3f98f 100644 --- a/cpp/tests/sampling/rw_low_level_test.cu +++ b/cpp/tests/sampling/rw_low_level_test.cu @@ -73,7 +73,7 @@ void next_biased(raft::handle_t const& handle, vector_test_t& d_next_v, selector_t const& selector) { - thrust::transform(rmm::exec_policy(handle.get_stream_view()), + thrust::transform(handle.get_thrust_policy(), d_src_v.begin(), d_src_v.end(), d_rnd.begin(), From a35d137effd7d02c388903b3e04463361acf293e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 00:06:54 -0400 Subject: [PATCH 12/57] code refinements --- .../cugraph/prims/row_col_properties.cuh | 17 +++-- .../cugraph/utilities/dataframe_buffer.cuh | 65 +++++++++++++++++++ cpp/src/community/louvain.cuh | 49 +++----------- .../components/weakly_connected_components.cu | 2 +- 4 files changed, 84 insertions(+), 49 deletions(-) diff --git 
a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index adf068d2eb5..3aa14a2e859 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -48,7 +48,8 @@ class major_properties_device_view_t { ValueIterator value_data() const { return value_first_; } - __device__ auto get(vertex_t offset) const { return *(value_first_ + offset); } + __device__ ValueIterator get_iter(vertex_t offset) const { return value_first_ + offset; } + __device__ value_type get(vertex_t offset) const { return *get_iter(offset); } private: ValueIterator value_first_{}; @@ -73,7 +74,7 @@ class minor_properties_device_view_t { { } - __device__ auto& get(vertex_t offset) const + __device__ ValueIterator get_iter(vertex_t offset) const { auto value_offset = offset; if (key_first_) { @@ -81,9 +82,11 @@ class minor_properties_device_view_t { assert((it != *key_last_) && (*it == offset)); value_offset = static_cast(thrust::distance(*key_first_, it)); } - return *(value_first_ + value_offset); + return value_first_ + value_offset; } + __device__ value_type get(vertex_t offset) const { return *get_iter(offset); } + private: thrust::optional key_first_{thrust::nullopt}; thrust::optional key_last_{thrust::nullopt}; @@ -113,15 +116,13 @@ class major_properties_t { auto device_view() const { - auto value_first = get_dataframe_buffer_begin(buffer_); + auto value_first = get_dataframe_buffer_cbegin(buffer_); return major_properties_device_view_t(value_first); } auto mutable_device_view() { auto value_first = get_dataframe_buffer_begin(buffer_); - static_assert( - !std::is_const_v::value_type>); return major_properties_device_view_t(value_first); } @@ -168,7 +169,7 @@ class minor_properties_t { auto device_view() const { - auto value_first = get_dataframe_buffer_begin(buffer_); + auto value_first = get_dataframe_buffer_cbegin(buffer_); if (key_first_) { return minor_properties_device_view_t( *key_first_, *key_last_, value_first); @@ -180,8 +181,6 @@ class minor_properties_t { auto mutable_device_view() { auto value_first = get_dataframe_buffer_begin(buffer_); - static_assert( - !std::is_const_v::value_type>); if (key_first_) { return minor_properties_device_view_t( *key_first_, *key_last_, value_first); diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.cuh b/cpp/include/cugraph/utilities/dataframe_buffer.cuh index 04c5db91d89..28afadedf8e 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.cuh +++ b/cpp/include/cugraph/utilities/dataframe_buffer.cuh @@ -92,6 +92,21 @@ auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence, BufferTyp get_dataframe_buffer_begin_tuple_element_impl(buffer)...); } +template +auto get_dataframe_buffer_cbegin_tuple_element_impl(BufferType& buffer) +{ + using element_t = typename thrust::tuple_element::type; + return std::get(buffer).cbegin(); +} + +template +auto get_dataframe_buffer_cbegin_tuple_impl(std::index_sequence, BufferType& buffer) +{ + // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator. 
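// Illustrative aside, not part of this diff: a dataframe buffer for thrust::tuple
// elements is a std::tuple of rmm::device_uvector's, and the cbegin()/cend() variants
// added here zip the const iterators of those vectors. A minimal sketch, assuming a
// handle and a size n (element type chosen only for the example):
//   auto buf    = allocate_dataframe_buffer<thrust::tuple<int32_t, float>>(n, handle.get_stream());
//   auto first  = get_dataframe_buffer_begin<thrust::tuple<int32_t, float>>(buf);   // mutable zip iterator
//   auto cfirst = get_dataframe_buffer_cbegin<thrust::tuple<int32_t, float>>(buf);  // zip of const iterators
// device_view() in row_col_properties.cuh above is now built on the cbegin() variant,
// so views handed to read-only primitives cannot be written through;
// mutable_device_view() keeps using begin().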
+ return thrust::make_tuple( + get_dataframe_buffer_cbegin_tuple_element_impl(buffer)...); +} + template auto get_dataframe_buffer_end_tuple_element_impl(BufferType& buffer) { @@ -106,6 +121,20 @@ auto get_dataframe_buffer_end_tuple_impl(std::index_sequence, BufferType& return thrust::make_tuple(get_dataframe_buffer_end_tuple_element_impl(buffer)...); } +template +auto get_dataframe_buffer_cend_tuple_element_impl(BufferType& buffer) +{ + using element_t = typename thrust::tuple_element::type; + return std::get(buffer).cend(); +} + +template +auto get_dataframe_buffer_cend_tuple_impl(std::index_sequence, BufferType& buffer) +{ + // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator. + return thrust::make_tuple(get_dataframe_buffer_cend_tuple_element_impl(buffer)...); +} + } // namespace detail template ::value>* = nullptr> @@ -200,6 +229,24 @@ auto get_dataframe_buffer_begin(BufferType& buffer) std::make_index_sequence(), buffer)); } +template ::value>* = nullptr> +auto get_dataframe_buffer_cbegin(BufferType& buffer) +{ + return buffer.cbegin(); +} + +template ::value>* = nullptr> +auto get_dataframe_buffer_cbegin(BufferType& buffer) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return thrust::make_zip_iterator(detail::get_dataframe_buffer_cbegin_tuple_impl( + std::make_index_sequence(), buffer)); +} + template ::value>* = nullptr> @@ -218,4 +265,22 @@ auto get_dataframe_buffer_end(BufferType& buffer) detail::get_dataframe_buffer_end_tuple_impl(std::make_index_sequence(), buffer)); } +template ::value>* = nullptr> +auto get_dataframe_buffer_cend(BufferType& buffer) +{ + return buffer.cend(); +} + +template ::value>* = nullptr> +auto get_dataframe_buffer_cend(BufferType& buffer) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return thrust::make_zip_iterator(detail::get_dataframe_buffer_cend_tuple_impl( + std::make_index_sequence(), buffer)); +} + } // namespace cugraph diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index aff64cad704..abcfe41d8b3 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -249,7 +249,7 @@ class Louvain { cluster_weights_v_ = std::move(rx_weights_v); } - if (graph_view_t::is_multi_gpu) { + if constexpr (graph_view_t::is_multi_gpu) { src_vertex_weights_cache_ = row_properties_t(handle_, current_graph_view_); copy_to_adj_matrix_row( @@ -261,20 +261,6 @@ class Louvain { timer_stop(handle_.get_stream_view()); } - template - void cache_src_properties(rmm::device_uvector& input, - row_properties_t& src_cache_) - { - copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_); - } - - template - void cache_dst_properties(rmm::device_uvector& input, - col_properties_t& dst_cache_) - { - copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_); - } - virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); @@ -289,9 +275,11 @@ class Louvain { if constexpr (graph_view_t::is_multi_gpu) { src_clusters_cache_ = row_properties_t(handle_, current_graph_view_); - cache_src_properties(next_clusters_v_, src_clusters_cache_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); dst_clusters_cache_ = col_properties_t(handle_, current_graph_view_); - cache_dst_properties(next_clusters_v_, dst_clusters_cache_); + copy_to_adj_matrix_col( + handle_, current_graph_view_, next_clusters_v_.begin(), 
dst_clusters_cache_); } weight_t new_Q = modularity(total_edge_weight, resolution); @@ -418,7 +406,7 @@ class Louvain { src_old_cluster_sum_subtract_pairs); } - auto output_buffer = cugraph::allocate_dataframe_buffer>( + auto output_buffer = allocate_dataframe_buffer>( current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); auto cluster_old_sum_subtract_pair_first = thrust::make_zip_iterator( @@ -443,39 +431,21 @@ class Louvain { copy_v_transform_reduce_key_aggregated_out_nbr( handle_, current_graph_view_, -#if 1 zipped_src_device_view, graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() : detail::minor_properties_device_view_t( next_clusters_v_.data()), -#else - thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), - d_src_vertex_weights_cache_, - src_cluster_subtract_v.begin(), - d_src_cluster_cache_, - src_cluster_weights_v.begin())), - - d_dst_cluster_cache_, -#endif cluster_keys_v_.begin(), cluster_keys_v_.end(), cluster_weights_v_.begin(), [total_edge_weight, resolution] __device__( auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { -#if 1 auto k_k = thrust::get<0>(src_info); auto src_cluster = thrust::get<1>(src_info); auto a_old = thrust::get<2>(src_info); auto old_cluster_sum = thrust::get<3>(src_info); auto cluster_subtract = thrust::get<4>(src_info); -#else - auto old_cluster_sum = thrust::get<0>(src_info); - auto k_k = thrust::get<1>(src_info); - auto cluster_subtract = thrust::get<2>(src_info); - auto src_cluster = thrust::get<3>(src_info); - auto a_old = thrust::get<4>(src_info); -#endif if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; @@ -512,8 +482,10 @@ class Louvain { }); if constexpr (graph_view_t::is_multi_gpu) { - cache_src_properties(next_clusters_v_, src_clusters_cache_); - cache_dst_properties(next_clusters_v_, dst_clusters_cache_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); } std::tie(cluster_keys_v_, cluster_weights_v_) = @@ -569,7 +541,6 @@ class Louvain { std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; - // FIXME: better move inside the update_by_delta_modularity? 
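// Illustrative recap, not part of this diff, of the multi-GPU caching pattern these
// members support (it mirrors the copy_to_adj_matrix_row/col calls in update_clustering
// above; in single-GPU builds the vertex arrays are read directly instead):
//   if constexpr (graph_view_t::is_multi_gpu) {
//     src_clusters_cache_ = row_properties_t<graph_view_t, vertex_t>(handle_, current_graph_view_);
//     copy_to_adj_matrix_row(handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_);
//     dst_clusters_cache_ = col_properties_t<graph_view_t, vertex_t>(handle_, current_graph_view_);
//     copy_to_adj_matrix_col(handle_, current_graph_view_, next_clusters_v_.begin(), dst_clusters_cache_);
//   }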
rmm::device_uvector cluster_keys_v_; rmm::device_uvector cluster_weights_v_; diff --git a/cpp/src/components/weakly_connected_components.cu b/cpp/src/components/weakly_connected_components.cu index 35f4343f721..0bc1bd4996b 100644 --- a/cpp/src/components/weakly_connected_components.cu +++ b/cpp/src/components/weakly_connected_components.cu @@ -555,7 +555,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, // FIXME: better switch to atomic_ref after // https://github.com/nvidia/libcudacxx/milestone/2 auto old = atomicCAS( - &(col_components.get(col_offset)), invalid_component_id::value, tag); + col_components.get_iter(col_offset), invalid_component_id::value, tag); if (old != invalid_component_id::value && old != tag) { // conflict static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto edge_idx = atomicAdd(reinterpret_cast(num_edge_inserts), From 83d7313804124aa045e35cd526f234b621f7fcda Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 10:18:56 -0400 Subject: [PATCH 13/57] code clean-up --- cpp/include/cugraph/graph_view.hpp | 14 ++++---------- ...y_v_transform_reduce_key_aggregated_out_nbr.cuh | 1 + 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 3cab3b7ff8f..81aa00fd2ea 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -776,22 +776,22 @@ class graph_view_t get_local_sorted_unique_edge_row_begin() const { - return local_sorted_unique_edge_row_first_; + return std::nullopt; } std::optional get_local_sorted_unique_edge_row_end() const { - return local_sorted_unique_edge_row_last_; + return std::nullopt; } std::optional get_local_sorted_unique_edge_col_begin() const { - return local_sorted_unique_edge_col_first_; + return std::nullopt; } std::optional get_local_sorted_unique_edge_col_end() const { - return local_sorted_unique_edge_col_last_; + return std::nullopt; } private: @@ -801,12 +801,6 @@ class graph_view_t> segment_offsets_{std::nullopt}; - - // FIXME: to be implemented. 
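// Caller-side sketch (illustrative, not part of this diff): since the single-GPU
// specialization now returns std::nullopt unconditionally, callers can branch on the
// optional instead of on is_multi_gpu:
//   if (auto row_first = graph_view.get_local_sorted_unique_edge_row_begin()) {
//     // key/value path: row properties are stored only for the sorted unique edge rows
//   } else {
//     // dense path: row properties are stored for every local row
//   }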
- std::optional local_sorted_unique_edge_row_first_{std::nullopt}; - std::optional local_sorted_unique_edge_row_last_{std::nullopt}; - std::optional local_sorted_unique_edge_col_first_{std::nullopt}; - std::optional local_sorted_unique_edge_col_last_{std::nullopt}; }; } // namespace cugraph diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index f34dc0af660..0ed211b9002 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -425,6 +425,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( matrix_partition.get_indices(), detail::minor_to_key_t{adj_matrix_col_key_input, matrix_partition.get_minor_first()}); + auto execution_policy = handle.get_thrust_policy(); thrust::copy(execution_policy, minor_key_first, minor_key_first + matrix_partition.get_number_of_edges(), From 92972ed82bcc869c5acf3e053b20a78b608f6808 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 10:24:43 -0400 Subject: [PATCH 14/57] clang-format --- .../prims/copy_to_adj_matrix_row_col.cuh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 37456c4b8bf..183eb38f944 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -440,7 +440,8 @@ void copy_to_matrix_minor(raft::handle_t const& handle, * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties (for the rows assigned to this process in multi-GPU). + * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties + * (for the rows assigned to this process in multi-GPU). */ template void copy_to_adj_matrix_row( @@ -452,11 +453,9 @@ void copy_to_adj_matrix_row( adj_matrix_row_value_output) { if constexpr (GraphViewType::is_adj_matrix_transposed) { - copy_to_matrix_minor( - handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); + copy_to_matrix_minor(handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); } else { - copy_to_matrix_major( - handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); + copy_to_matrix_major(handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); } } @@ -481,7 +480,8 @@ void copy_to_adj_matrix_row( * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties (for the rows assigned to this process in multi-GPU). + * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties + * (for the rows assigned to this process in multi-GPU). 
*/ template void copy_to_adj_matrix_row( @@ -525,7 +525,8 @@ void copy_to_adj_matrix_row( * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties (for the columns assigned to this process in multi-GPU). + * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties + * (for the columns assigned to this process in multi-GPU). */ template void copy_to_adj_matrix_col( @@ -537,11 +538,9 @@ void copy_to_adj_matrix_col( adj_matrix_col_value_output) { if constexpr (GraphViewType::is_adj_matrix_transposed) { - copy_to_matrix_major( - handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); + copy_to_matrix_major(handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); } else { - copy_to_matrix_minor( - handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); + copy_to_matrix_minor(handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); } } @@ -566,7 +565,8 @@ void copy_to_adj_matrix_col( * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties (for the columns assigned to this process in multi-GPU). + * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties + * (for the columns assigned to this process in multi-GPU). */ template void copy_to_adj_matrix_col( From f39d26672b9ba37a412a8a1c6dbec616388a9927 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 11:32:42 -0400 Subject: [PATCH 15/57] documentation update --- cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 183eb38f944..ab27e7cc3c7 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -429,8 +429,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, * @brief Copy vertex property values to the corresponding graph adjacency matrix row property * variables. * - * This version fills the entire set of graph adjacency matrix row property values. This function is - * inspired by thrust::copy(). + * This version fills the entire set of graph adjacency matrix row property values. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. @@ -465,7 +464,7 @@ void copy_to_adj_matrix_row( * * This version fills only a subset of graph adjacency matrix row property values. [@p vertex_first, * @p vertex_last) specifies the vertices with new values to be copied to graph adjacency matrix row - * property variables. This function is inspired by thrust::copy(). + * property variables. * * @tparam GraphViewType Type of the passed non-owning graph object. 
* @tparam VertexIterator Type of the iterator for vertex identifiers. From 06c3fa9b64ac75d0352471e645a05b49d8e4e0e6 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 14:00:03 -0400 Subject: [PATCH 16/57] bug fixes --- cpp/src/community/louvain.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index abcfe41d8b3..ea8b753494f 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -378,6 +378,7 @@ class Louvain { cluster_keys_v_.end(), cluster_weights_v_.begin()); + vertex_cluster_weights_v.resize(next_clusters_v_.size(), handle_.get_stream()); thrust::transform(handle_.get_thrust_policy(), next_clusters_v_.begin(), next_clusters_v_.end(), @@ -494,7 +495,10 @@ class Louvain { current_graph_view_, dummy_properties_t{}.device_view(), dummy_properties_t{}.device_view(), - src_clusters_cache_.device_view(), + graph_view_t::is_multi_gpu + ? src_clusters_cache_.device_view() + : detail::major_properties_device_view_t( + next_clusters_v_.data()), [] __device__(auto, auto, auto wt, auto, auto) { return wt; }, weight_t{0}); } From 6fde1c0ac7dec5a638c4f72760fea882d68ce9ae Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 15:55:54 -0400 Subject: [PATCH 17/57] additional bug fix --- cpp/src/structure/create_graph_from_edgelist.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist.cpp b/cpp/src/structure/create_graph_from_edgelist.cpp index d3a385b05bf..c9181f1e000 100644 --- a/cpp/src/structure/create_graph_from_edgelist.cpp +++ b/cpp/src/structure/create_graph_from_edgelist.cpp @@ -113,8 +113,10 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, *vertex_partition_segment_offsets) = cugraph::renumber_edgelist( handle, - std::optional>{std::make_tuple( - (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))}, + local_vertex_span + ? std::optional>{std::make_tuple( + (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))} + : std::nullopt, major_ptrs, minor_ptrs, edgelist_edge_counts,
From dafa4ed3c0b6ac373416e7cfc46698a1eba1464a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 10:52:25 -0400 Subject: [PATCH 19/57] device lambda to struct functor --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 72 +++++++++++++------ cpp/src/community/louvain.cuh | 49 ++++++++----- 2 files changed, 85 insertions(+), 36 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 0ed211b9002..af25fae7234 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -46,12 +46,51 @@ struct minor_to_key_t { using vertex_t = typename AdjMatrixColKeyInputWrapper::value_type; AdjMatrixColKeyInputWrapper adj_matrix_col_key_input{}; vertex_t minor_first{}; - __device__ vertex_t operator()(vertex_t minor) + __device__ vertex_t operator()(vertex_t minor) const { return adj_matrix_col_key_input.get(minor - minor_first); } }; +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct minor_key_to_col_rank_t { + compute_gpu_id_from_vertex_t key_func{}; + int row_comm_size{}; + __device__ int operator()( + thrust::tuple val /* major, minor key, weight */) const + { + return key_func(thrust::get<1>(val)) / row_comm_size; + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct call_key_aggregated_e_op_t { + AdjMatrixRowValueInputWrapper matrix_partition_row_value_input{}; + KeyAggregatedEdgeOp key_aggregated_e_op{}; + MatrixPartitionDeviceView matrix_partition{}; + StaticMapDeviceView kv_map{}; + __device__ auto operator()( + thrust::tuple val /* major, minor key, weight */) const + { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op(major, + key, + w, + matrix_partition_row_value_input.get( + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + } +}; + template __global__ void for_all_major_for_all_nbr_mid_degree( matrix_partition_device_view_t matrix_partition, @@ -512,10 +551,8 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( col_comm, triplet_first, triplet_first + tmp_major_vertices.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}, - row_comm_size] __device__(auto val) { - return key_func(thrust::get<1>(val)) / row_comm_size; - }, + detail::minor_key_to_col_rank_t{ + detail::compute_gpu_id_from_vertex_t{comm_size}, row_comm_size}, handle.get_stream()); auto pair_first = thrust::make_zip_iterator( @@ -558,21 +595,16 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( triplet_first, triplet_first + tmp_major_vertices.size(), tmp_e_op_result_buffer_first, - [matrix_partition_row_value_input, - key_aggregated_e_op, - matrix_partition, - kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); - return key_aggregated_e_op( - major, - key, - w, - matrix_partition_row_value_input.get( - matrix_partition.get_major_offset_from_major_nocheck(major)), - 
kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); - }); + detail::call_key_aggregated_e_op_tget_device_view())>{ + matrix_partition_row_value_input, + key_aggregated_e_op, + matrix_partition, + kv_map_ptr->get_device_view()}); tmp_minor_keys.resize(0, handle.get_stream()); tmp_key_aggregated_edge_weights.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index ea8b753494f..a2e448e85a0 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -43,6 +43,38 @@ namespace cugraph { +namespace detail { + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct key_aggregated_edge_op_t { + weight_t total_edge_weight{}; + weight_t resolution{}; + __device__ auto operator()( + vertex_t src, + vertex_t neighbor_cluster, + weight_t new_cluster_sum, + thrust::tuple src_info, + weight_t a_new) const + { + auto k_k = thrust::get<0>(src_info); + auto src_cluster = thrust::get<1>(src_info); + auto a_old = thrust::get<2>(src_info); + auto old_cluster_sum = thrust::get<3>(src_info); + auto cluster_subtract = thrust::get<4>(src_info); + + if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; + + weight_t delta_modularity = 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - + resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / + (total_edge_weight * total_edge_weight)); + + return thrust::make_tuple(neighbor_cluster, delta_modularity); + } +}; + +} // namespace detail + template class Louvain { public: @@ -440,22 +472,7 @@ class Louvain { cluster_keys_v_.begin(), cluster_keys_v_.end(), cluster_weights_v_.begin(), - [total_edge_weight, resolution] __device__( - auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { - auto k_k = thrust::get<0>(src_info); - auto src_cluster = thrust::get<1>(src_info); - auto a_old = thrust::get<2>(src_info); - auto old_cluster_sum = thrust::get<3>(src_info); - auto cluster_subtract = thrust::get<4>(src_info); - - if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; - - weight_t delta_modularity = 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - - resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / - (total_edge_weight * total_edge_weight)); - - return thrust::make_tuple(neighbor_cluster, delta_modularity); - }, + detail::key_aggregated_edge_op_t{total_edge_weight, resolution}, [] __device__(auto p1, auto p2) { auto id1 = thrust::get<0>(p1); auto id2 = thrust::get<0>(p2); From 0734f2a08cb45b8d839187d3cb0f7bb0876fd2e9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 16:08:32 -0400 Subject: [PATCH 20/57] cleanup multi-source BFS artifacts --- cpp/include/cugraph/algorithms.hpp | 3 +- cpp/src/traversal/bfs.cu | 50 ++++++++--------- cpp/tests/traversal/mg_bfs_test.cpp | 13 ++++-- 3 files changed, 25 insertions(+), 41 deletions(-) diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 7d405d324fc..d719abaaa4b 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1143,7 +1143,6 @@ weight_t hungarian(raft::handle_t const& handle, * @param predecessors Pointer to the output predecessor array or `nullptr`. * @param sources Source vertices to start breadth-first search (root vertex of the breadth-first * search tree). 
If more than one source is passed, there must be a single source per component. - * Device memory and host memory are accepted. * @param n_sources number of sources (one source per component at most). * @param direction_optimizing If set to true, this algorithm switches between the push based * breadth-first search and pull based breadth-first search depending on the size of the @@ -1158,7 +1157,7 @@ void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, vertex_t* distances, vertex_t* predecessors, - vertex_t* sources, + vertex_t* const sources, size_t n_sources = 1, bool direction_optimizing = false, vertex_t depth_limit = std::numeric_limits::max(), diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu index 3cf97399b51..abe90d8cc7e 100644 --- a/cpp/src/traversal/bfs.cu +++ b/cpp/src/traversal/bfs.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -47,7 +46,7 @@ void bfs(raft::handle_t const& handle, GraphViewType const& push_graph_view, typename GraphViewType::vertex_type* distances, PredecessorIterator predecessor_first, - typename GraphViewType::vertex_type* sources, + typename GraphViewType::vertex_type* const sources, size_t n_sources, bool direction_optimizing, typename GraphViewType::vertex_type depth_limit, @@ -62,7 +61,11 @@ void bfs(raft::handle_t const& handle, auto const num_vertices = push_graph_view.get_number_of_vertices(); if (num_vertices == 0) { return; } - // CUGRAPH_EXPECTS(sources != nullptr, "Invalid input argument: sources cannot be null"); + + // 1. check input arguments + + CUGRAPH_EXPECTS((n_sources == 0) || (sources != nullptr), + "Invalid input argument: sources cannot be null"); auto aggregate_n_sources = GraphViewType::is_multi_gpu @@ -71,31 +74,18 @@ void bfs(raft::handle_t const& handle, CUGRAPH_EXPECTS(aggregate_n_sources > 0, "Invalid input argument: input should have at least one source"); - // 1. 
check input arguments CUGRAPH_EXPECTS( push_graph_view.is_symmetric() || !direction_optimizing, "Invalid input argument: input graph should be symmetric for direction optimizing BFS."); - // Transfer single source to the device for single source case - vertex_t* d_sources = sources; - rmm::device_uvector d_sources_v(0, handle.get_stream()); - if (aggregate_n_sources == 1 && n_sources) { - cudaPointerAttributes s_att; - CUDA_CHECK(cudaPointerGetAttributes(&s_att, sources)); - if (s_att.devicePointer == nullptr) { - d_sources_v.resize(n_sources, handle.get_stream()); - d_sources = d_sources_v.data(); - raft::copy(d_sources, sources, n_sources, handle.get_stream()); - } - } - if (do_expensive_check) { - vertex_partition_device_t vertex_partition(push_graph_view); + auto vertex_partition = vertex_partition_device_view_t( + push_graph_view.get_vertex_partition_view()); auto num_invalid_vertices = count_if_v(handle, push_graph_view, - d_sources, - d_sources + n_sources, + sources, + sources + n_sources, [vertex_partition] __device__(auto val) { return !(vertex_partition.is_valid_vertex(val) && vertex_partition.is_local_vertex_nocheck(val)); @@ -122,8 +112,8 @@ void bfs(raft::handle_t const& handle, if (n_sources) { thrust::for_each( rmm::exec_policy(handle.get_thrust_policy()), - d_sources, - d_sources + n_sources, + sources, + sources + n_sources, [vertex_partition, distances, predecessor_first] __device__(auto v) { *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)) = vertex_t{0}; @@ -138,20 +128,8 @@ void bfs(raft::handle_t const& handle, static_cast(Bucket::num_buckets)> vertex_frontier(handle); - // insert local source(s) in the bucket - if (aggregate_n_sources == 1) { - vertex_t src; - // FIXME: this (cheap) transfer could be skiped when is_local_vertex_nocheck accpets device mem - raft::copy(&src, sources, n_sources, handle.get_stream()); - if (push_graph_view.is_local_vertex_nocheck(src)) { - vertex_frontier.get_bucket(static_cast(Bucket::cur)) - .insert(d_sources, d_sources + n_sources); - } - } else { - // pre-shuffled - vertex_frontier.get_bucket(static_cast(Bucket::cur)) - .insert(d_sources, d_sources + n_sources); - } + vertex_frontier.get_bucket(static_cast(Bucket::cur)).insert(sources, sources + n_sources); + // 4. BFS iteration vertex_t depth{0}; while (true) { diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index e2b2100efad..97f95507205 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -114,12 +114,17 @@ class Tests_MGBFS : public ::testing::TestWithParam>(bfs_usecase.source, handle.get_stream()) + : std::nullopt; + cugraph::bfs(handle, mg_graph_view, d_mg_distances.data(), d_mg_predecessors.data(), - static_cast(bfs_usecase.source), - false, + d_mg_source ? (*d_mg_source).data() : static_cast(nullptr), + d_mg_source ? size_t{1} : size_t{0}, std::numeric_limits::max()); if (cugraph::test::g_perf) { @@ -191,11 +196,13 @@ class Tests_MGBFS : public ::testing::TestWithParam d_sg_source(unrenumbered_source, handle.get_stream()); cugraph::bfs(handle, sg_graph_view, d_sg_distances.data(), d_sg_predecessors.data(), - unrenumbered_source, + d_sg_source.data(), + size_t{1}, false, std::numeric_limits::max()); // 4-5. 
compare From e7e3db1dbd59bc705668c7dcab240f0730919f23 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 16:13:56 -0400 Subject: [PATCH 21/57] add missing const --- cpp/src/traversal/bfs.cu | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu index abe90d8cc7e..fa562e38922 100644 --- a/cpp/src/traversal/bfs.cu +++ b/cpp/src/traversal/bfs.cu @@ -46,7 +46,7 @@ void bfs(raft::handle_t const& handle, GraphViewType const& push_graph_view, typename GraphViewType::vertex_type* distances, PredecessorIterator predecessor_first, - typename GraphViewType::vertex_type* const sources, + typename GraphViewType::vertex_type const* sources, size_t n_sources, bool direction_optimizing, typename GraphViewType::vertex_type depth_limit, @@ -195,7 +195,7 @@ void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, vertex_t* distances, vertex_t* predecessors, - vertex_t* sources, + vertex_t const* sources, size_t n_sources, bool direction_optimizing, vertex_t depth_limit, @@ -230,7 +230,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -240,7 +240,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -250,7 +250,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -260,7 +260,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -270,7 +270,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t* distances, int64_t* predecessors, - int64_t* sources, + int64_t const* sources, size_t n_sources, bool direction_optimizing, int64_t depth_limit, @@ -280,7 +280,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t* distances, int64_t* predecessors, - int64_t* sources, + int64_t const* sources, size_t n_sources, bool direction_optimizing, int64_t depth_limit, @@ -290,7 +290,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -300,7 +300,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -310,7 +310,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -320,7 +320,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + 
int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -330,7 +330,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t* distances, int64_t* predecessors, - int64_t* sources, + int64_t const* sources, size_t n_sources, bool direction_optimizing, int64_t depth_limit, @@ -340,7 +340,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t* distances, int64_t* predecessors, - int64_t* sources, + int64_t const* sources, size_t n_sources, bool direction_optimizing, int64_t depth_limit, From 559d270e985687e8f2a5628406568d7345863542 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 16:33:38 -0400 Subject: [PATCH 22/57] additional fixes --- cpp/include/cugraph/algorithms.hpp | 2 +- cpp/tests/traversal/bfs_test.cpp | 7 ++++--- cpp/tests/traversal/mg_bfs_test.cpp | 4 ++-- cpp/tests/traversal/ms_bfs_test.cu | 10 ++++++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index d719abaaa4b..2c2f64217f8 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1157,7 +1157,7 @@ void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, vertex_t* distances, vertex_t* predecessors, - vertex_t* const sources, + vertex_t const* sources, size_t n_sources = 1, bool direction_optimizing = false, vertex_t depth_limit = std::numeric_limits::max(), diff --git a/cpp/tests/traversal/bfs_test.cpp b/cpp/tests/traversal/bfs_test.cpp index 04b41db9b9b..3f1e18d8cbf 100644 --- a/cpp/tests/traversal/bfs_test.cpp +++ b/cpp/tests/traversal/bfs_test.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -133,14 +134,14 @@ class Tests_BFS : public ::testing::TestWithParam const d_source(bfs_usecase.source, handle.get_stream()); cugraph::bfs(handle, graph_view, d_distances.data(), d_predecessors.data(), - &source, - 1, + d_source.data(), + size_t{1}, false, std::numeric_limits::max()); diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 97f95507205..2a9d33fa6d5 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -114,7 +114,7 @@ class Tests_MGBFS : public ::testing::TestWithParam>(bfs_usecase.source, handle.get_stream()) : std::nullopt; @@ -196,7 +196,7 @@ class Tests_MGBFS : public ::testing::TestWithParam d_sg_source(unrenumbered_source, handle.get_stream()); + rmm::device_scalar const d_sg_source(unrenumbered_source, handle.get_stream()); cugraph::bfs(handle, sg_graph_view, d_sg_distances.data(), diff --git a/cpp/tests/traversal/ms_bfs_test.cu b/cpp/tests/traversal/ms_bfs_test.cu index 98a9d613c08..b1833c3f295 100644 --- a/cpp/tests/traversal/ms_bfs_test.cu +++ b/cpp/tests/traversal/ms_bfs_test.cu @@ -180,12 +180,13 @@ class Tests_MsBfs : public ::testing::TestWithParam { bool direction_optimizing = false; vertex_t source = h_sources[0]; + rmm::device_scalar const d_source_0(source, handle.get_stream()); cugraph::bfs(handle, graph_view, d_distances_ref[0].begin(), d_predecessors_ref[0].begin(), - &source, - 1, + d_source_0.data(), + size_t{1}, direction_optimizing, configuration.radius); @@ -195,12 +196,13 @@ class Tests_MsBfs : public ::testing::TestWithParam { cudaProfilerStart(); for (size_t i = 0; i < h_sources.size(); i++) { source = h_sources[i]; + rmm::device_scalar const d_source_i(source, handle.get_stream()); cugraph::bfs(handle, graph_view, 
d_distances_ref[i].begin(), d_predecessors_ref[i].begin(), - &source, - 1, + d_source_i.data(), + size_t{1}, direction_optimizing, configuration.radius); } From 3248c7e89cf7ae3bb6536d81ad38c4e67742e46e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 16:45:14 -0400 Subject: [PATCH 23/57] more fixes --- cpp/tests/traversal/mg_bfs_test.cpp | 1 + cpp/tests/traversal/ms_bfs_test.cu | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 2a9d33fa6d5..1a938edeee9 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -125,6 +125,7 @@ class Tests_MGBFS : public ::testing::TestWithParam(nullptr), d_mg_source ? size_t{1} : size_t{0}, + false, std::numeric_limits::max()); if (cugraph::test::g_perf) { diff --git a/cpp/tests/traversal/ms_bfs_test.cu b/cpp/tests/traversal/ms_bfs_test.cu index b1833c3f295..94776689929 100644 --- a/cpp/tests/traversal/ms_bfs_test.cu +++ b/cpp/tests/traversal/ms_bfs_test.cu @@ -23,19 +23,23 @@ #include #include -#include -#include #include +#include +#include +#include +#include +#include + +#include #include #include #include + +#include + #include #include #include -#include -#include -#include -#include #include #include #include From ee07b2bb537c3261caf179b80dd3d08fb4b49973 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 3 Sep 2021 23:31:16 -0400 Subject: [PATCH 24/57] additional bug fixes --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 116 ++++++++++++------ cpp/src/community/louvain.cuh | 64 +++++++--- 2 files changed, 122 insertions(+), 58 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index af25fae7234..7d68cf8d13d 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -79,9 +79,12 @@ struct call_key_aggregated_e_op_t { __device__ auto operator()( thrust::tuple val /* major, minor key, weight */) const { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + auto row_value = matrix_partition_row_value_input.get( + matrix_partition.get_major_offset_from_major_nocheck(major)); + auto key_val = kv_map.find(key)->second.load(cuda::std::memory_order_relaxed); return key_aggregated_e_op(major, key, w, @@ -91,6 +94,52 @@ struct call_key_aggregated_e_op_t { } }; +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct is_first_in_run_t { + vertex_t const* major_vertices{nullptr}; + __device__ bool operator()(size_t i) const + { + return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) ? true : false; + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct is_valid_vertex_t { + __device__ bool operator()(vertex_t v) const { return v != invalid_vertex_id::value; } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct invalidate_if_not_first_in_run_t { + vertex_t const* major_vertices{nullptr}; + __device__ vertex_t operator()(size_t i) const + { + return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) + ? 
major_vertices[i] + : invalid_vertex_id::value; + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct vertex_local_offset_t { + vertex_partition_device_view_t vertex_partition{}; + __device__ vertex_t operator()(vertex_t v) const + { + return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct reduce_with_init_t { + ReduceOp reduce_op{}; + T init{}; + __device__ T operator()(T val) const { return reduce_op(val, init); } +}; + template __global__ void for_all_major_for_all_nbr_mid_degree( matrix_partition_device_view_t matrix_partition, @@ -523,9 +572,9 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( tmp_major_vertices.resize(reduced_size, handle.get_stream()); tmp_minor_keys.resize(tmp_major_vertices.size(), handle.get_stream()); tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); - tmp_major_vertices.shrink_to_fit(handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); + tmp_major_vertices.shrink_to_fit(handle.get_stream()); } if (GraphViewType::is_multi_gpu) { @@ -682,50 +731,39 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( major_vertices.end(), get_dataframe_buffer_begin(e_op_result_buffer)); - auto num_uniques = thrust::count_if( - execution_policy, - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(major_vertices.size()), - [major_vertices = major_vertices.data()] __device__(auto i) { - return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) ? true : false; - }); + auto num_uniques = thrust::count_if(execution_policy, + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(major_vertices.size()), + detail::is_first_in_run_t{major_vertices.data()}); rmm::device_uvector unique_major_vertices(num_uniques, handle.get_stream()); auto major_vertex_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), - [major_vertices = major_vertices.data()] __device__(auto i) { - return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) - ? 
major_vertices[i] - : invalid_vertex_id::value; - }); - thrust::copy_if( - execution_policy, - major_vertex_first, - major_vertex_first + major_vertices.size(), - unique_major_vertices.begin(), - [] __device__(auto major) { return major != invalid_vertex_id::value; }); - thrust::reduce_by_key( - execution_policy, - major_vertices.begin(), - major_vertices.end(), - get_dataframe_buffer_begin(e_op_result_buffer), - thrust::make_discard_iterator(), - thrust::make_permutation_iterator( - vertex_value_output_first, - thrust::make_transform_iterator( - unique_major_vertices.begin(), - [vertex_partition = vertex_partition_device_view_t( - graph_view.get_vertex_partition_view())] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - })), - thrust::equal_to{}, - reduce_op); + detail::invalidate_if_not_first_in_run_t{major_vertices.data()}); + thrust::copy_if(execution_policy, + major_vertex_first, + major_vertex_first + major_vertices.size(), + unique_major_vertices.begin(), + detail::is_valid_vertex_t{}); + thrust::reduce_by_key(execution_policy, + major_vertices.begin(), + major_vertices.end(), + get_dataframe_buffer_begin(e_op_result_buffer), + thrust::make_discard_iterator(), + thrust::make_permutation_iterator( + vertex_value_output_first, + thrust::make_transform_iterator( + unique_major_vertices.begin(), + detail::vertex_local_offset_t{ + graph_view.get_vertex_partition_view()})), + thrust::equal_to{}, + reduce_op); thrust::transform(execution_policy, vertex_value_output_first, vertex_value_output_first + graph_view.get_number_of_local_vertices(), vertex_value_output_first, - [reduce_op, init] __device__(auto val) { return reduce_op(val, init); }); + detail::reduce_with_init_t{reduce_op, init}); } } // namespace cugraph diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index a2e448e85a0..29153fc2d37 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -73,6 +73,46 @@ struct key_aggregated_edge_op_t { } }; +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct reduce_op_t { + __device__ auto operator()(thrust::tuple p0, + thrust::tuple p1) const + { + auto id0 = thrust::get<0>(p0); + auto id1 = thrust::get<0>(p1); + auto wt0 = thrust::get<1>(p0); + auto wt1 = thrust::get<1>(p1); + + return (wt0 < wt1) ? p1 : ((wt0 > wt1) ? p0 : ((id0 < id1) ? p0 : p1)); + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct cluster_update_op_t { + bool up_down{}; + __device__ auto operator()(vertex_t old_cluster, thrust::tuple p) const + { + vertex_t new_cluster = thrust::get<0>(p); + weight_t delta_modularity = thrust::get<1>(p); + + return (delta_modularity > weight_t{0}) + ? (((new_cluster > old_cluster) != up_down) ? old_cluster : new_cluster) + : old_cluster; + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct return_edge_weight_t { + __device__ auto operator()( + vertex_t, vertex_t, weight_t w, thrust::nullopt_t, thrust::nullopt_t) const + { + return w; + } +}; + } // namespace detail template @@ -473,14 +513,7 @@ class Louvain { cluster_keys_v_.end(), cluster_weights_v_.begin(), detail::key_aggregated_edge_op_t{total_edge_weight, resolution}, - [] __device__(auto p1, auto p2) { - auto id1 = thrust::get<0>(p1); - auto id2 = thrust::get<0>(p2); - auto wt1 = thrust::get<1>(p1); - auto wt2 = thrust::get<1>(p2); - - return (wt1 < wt2) ? 
p2 : ((wt1 > wt2) ? p1 : ((id1 < id2) ? p1 : p2)); - }, + detail::reduce_op_t{}, thrust::make_tuple(vertex_t{-1}, weight_t{0}), cugraph::get_dataframe_buffer_begin>(output_buffer)); @@ -490,20 +523,13 @@ class Louvain { next_clusters_v_.end(), cugraph::get_dataframe_buffer_begin>(output_buffer), next_clusters_v_.begin(), - [up_down] __device__(vertex_t old_cluster, auto p) { - vertex_t new_cluster = thrust::get<0>(p); - weight_t delta_modularity = thrust::get<1>(p); - - return (delta_modularity > weight_t{0}) - ? (((new_cluster > old_cluster) != up_down) ? old_cluster : new_cluster) - : old_cluster; - }); + detail::cluster_update_op_t{up_down}); if constexpr (graph_view_t::is_multi_gpu) { copy_to_adj_matrix_row( handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); - copy_to_adj_matrix_row( - handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); + copy_to_adj_matrix_col( + handle_, current_graph_view_, next_clusters_v_.begin(), dst_clusters_cache_); } std::tie(cluster_keys_v_, cluster_weights_v_) = @@ -516,7 +542,7 @@ class Louvain { ? src_clusters_cache_.device_view() : detail::major_properties_device_view_t( next_clusters_v_.data()), - [] __device__(auto, auto, auto wt, auto, auto) { return wt; }, + detail::return_edge_weight_t{}, weight_t{0}); } From 44862f745629b902f1c41e7032aed8dd3b7e8e2a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 14:02:06 -0400 Subject: [PATCH 25/57] add optional variables storing local unique edge rows/columns to graph_t & graph_view_t --- cpp/include/cugraph/graph.hpp | 5 +++++ cpp/include/cugraph/graph_view.hpp | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index de33469e792..69ab403bbdb 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -131,6 +131,11 @@ class graph_t 0 std::optional> adj_matrix_partition_segment_offsets_{std::nullopt}; + + // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge + // rows/cols << V / row_comm_size|col_comm_size). + std::optional> local_sorted_unique_edge_rows{std::nullopt}; + std::optional> local_sorted_unique_edge_cols{std::nullopt}; }; // single-GPU version diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 81aa00fd2ea..f0040b9acb3 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -590,7 +590,8 @@ class graph_view_t> adj_matrix_partition_segment_offsets_{}; - // FIXME: to be implemented. + // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge + // rows/cols << V / row_comm_size|col_comm_size). 
std::optional local_sorted_unique_edge_row_first_{std::nullopt}; std::optional local_sorted_unique_edge_row_last_{std::nullopt}; std::optional local_sorted_unique_edge_col_first_{std::nullopt}; From a04844223131f58b8e63d25d1d6c19f5f992a617 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 14:14:49 -0400 Subject: [PATCH 26/57] update renumber_edgelist (in MG) to return # local unique edge rows/cols in addition (this information will be used in future memory scaling optimization) --- cpp/include/cugraph/graph_functions.hpp | 13 +-- cpp/src/structure/renumber_edgelist.cu | 115 +++++++++++++++--------- 2 files changed, 79 insertions(+), 49 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 61bf30e86a2..7e0c41603cd 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -69,10 +69,11 @@ namespace cugraph { * for further memory footprint optimization if provided. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). * @return std::tuple, partition_t, vertex_t, edge_t, - * std::vector> Tuple of labels (vertex IDs before renumbering) for the entire set of - * vertices (assigned to this process in multi-GPU), partition_t object storing graph partitioning - * information, total number of vertices, total number of edges, and vertex partition segment - * offsets (a vertex partition is partitioned to multiple segments based on vertex degrees). + * std::vector, vertex_t, vertex_t> Tuple of labels (vertex IDs before renumbering) for + * the entire set of vertices (assigned to this process in multi-GPU), partition_t object storing + * graph partitioning information, total number of vertices, total number of edges, vertex partition + * segment offsets (a vertex partition is partitioned to multiple segments based on vertex degrees), + * and the number of unique edge rows and columns. 
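 *
 * A calling sketch (illustrative, multi-GPU path; argument setup omitted):
 *   auto [renumber_map_labels, partition, number_of_vertices, number_of_edges,
 *         segment_offsets, num_unique_edge_rows, num_unique_edge_cols] =
 *     renumber_edgelist<vertex_t, edge_t, true>(handle, local_vertex_span, major_ptrs,
 *                                               minor_ptrs, edgelist_edge_counts,
 *                                               std::nullopt, do_expensive_check);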
*/ template std::enable_if_t, vertex_t, edge_t, - std::vector>> + std::vector, + vertex_t, + vertex_t>> renumber_edgelist( raft::handle_t const& handle, std::optional> local_vertex_span, diff --git a/cpp/src/structure/renumber_edgelist.cu b/cpp/src/structure/renumber_edgelist.cu index ecdb06e399a..436ad674443 100644 --- a/cpp/src/structure/renumber_edgelist.cu +++ b/cpp/src/structure/renumber_edgelist.cu @@ -44,13 +44,14 @@ namespace cugraph { namespace detail { +// returns renumber map, segment_offsets, and # unique edge majors & minors template -std::tuple, std::vector> compute_renumber_map( - raft::handle_t const& handle, - std::optional> vertex_span, - std::vector const& edgelist_major_vertices, - std::vector const& edgelist_minor_vertices, - std::vector const& edgelist_edge_counts) +std::tuple, std::vector, vertx_t, vertex_t> +compute_renumber_map(raft::handle_t const& handle, + std::optional> vertex_span, + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) { // FIXME: compare this sort based approach with hash based approach in both speed and memory // footprint @@ -75,6 +76,7 @@ std::tuple, std::vector> compute_renumbe rmm::device_uvector major_labels(0, handle.get_stream()); rmm::device_uvector major_counts(0, handle.get_stream()); + vertex_t num_unique_edge_majors{0}; for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { rmm::device_uvector tmp_major_labels(0, handle.get_stream()); rmm::device_uvector tmp_major_counts(0, handle.get_stream()); @@ -104,6 +106,7 @@ std::tuple, std::vector> compute_renumbe tmp_major_labels.begin(), tmp_major_counts.begin()); } + num_unique_edge_majors += static_cast(tmp_major_labels.size()); if (multi_gpu) { auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); @@ -170,6 +173,7 @@ std::tuple, std::vector> compute_renumbe edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), handle.get_stream()); + vertex_t num_unique_edge_minors{0}; for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_minor_vertices[i], @@ -182,6 +186,7 @@ std::tuple, std::vector> compute_renumbe minor_labels.begin(), thrust::unique(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end())), handle.get_stream()); + num_unique_edge_minors += static_cast(minor_labels.size()); if (multi_gpu) { auto& comm = handle.get_comms(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -364,7 +369,8 @@ std::tuple, std::vector> compute_renumbe handle.get_stream()); handle.get_stream_view().synchronize(); - return std::make_tuple(std::move(labels), h_segment_offsets); + return std::make_tuple( + std::move(labels), h_segment_offsets, num_unique_edge_majors, num_unique_edge_minors); } template @@ -609,7 +615,9 @@ std::enable_if_t, vertex_t, edge_t, - std::vector>> + std::vector, + vertex_t, + vertex_t>> renumber_edgelist( raft::handle_t const& handle, std::optional> local_vertex_span, @@ -648,7 +656,10 @@ renumber_edgelist( // 1. 
compute renumber map - auto [renumber_map_labels, vertex_partition_segment_offsets] = + auto [renumber_map_labels, + vertex_partition_segment_offsets, + num_unique_edge_majors, + num_unique_edge_minors] = detail::compute_renumber_map(handle, local_vertex_span, edgelist_const_major_vertices, @@ -832,7 +843,9 @@ renumber_edgelist( partition, number_of_vertices, number_of_edges, - vertex_partition_segment_offsets); + vertex_partition_segment_offsets, + num_unique_edge_majors, + num_unique_edge_minors); } template @@ -854,7 +867,9 @@ renumber_edgelist(raft::handle_t const& handle, std::nullopt); } - auto [renumber_map_labels, segment_offsets] = + rmm::device_uvector renumber_map_labels(0, handle.get_stream()); + std::vector segment_offsets{}; + std::tie(renumber_map_labels, segment_offsets, std::ignore, std::ignore) = detail::compute_renumber_map( handle, vertex_span, @@ -893,17 +908,21 @@ renumber_edgelist(raft::handle_t const& handle, // instantiations for // -template std:: - tuple, partition_t, int32_t, int32_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, + partition_t, + int32_t, + int32_t, + std::vector, + int32_t, + int32_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); template std::tuple, std::vector> renumber_edgelist( @@ -916,17 +935,21 @@ renumber_edgelist( // instantiations for // -template std:: - tuple, partition_t, int32_t, int64_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, + partition_t, + int32_t, + int64_t, + std::vector, + int32_t, + int32_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); template std::tuple, std::vector> renumber_edgelist( @@ -939,17 +962,21 @@ renumber_edgelist( // instantiations for // -template std:: - tuple, partition_t, int64_t, int64_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, + partition_t, + int64_t, + int64_t, + std::vector, + int64_t, + int64_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> local_vertex_span, + std::vector const& 
edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); template std::tuple, std::vector> renumber_edgelist( From ed46464de5906491e54a6c4411aa1de2cb11725d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 14:25:39 -0400 Subject: [PATCH 27/57] fix an erroneous comment --- cpp/include/cugraph/graph_functions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 7e0c41603cd..00331f15906 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -73,7 +73,7 @@ namespace cugraph { * the entire set of vertices (assigned to this process in multi-GPU), partition_t object storing * graph partitioning information, total number of vertices, total number of edges, vertex partition * segment offsets (a vertex partition is partitioned to multiple segments based on vertex degrees), - * and the number of unique edge rows and columns. + * and the number of unique edge major & minor vertex IDs. */ template std::enable_if_t Date: Wed, 8 Sep 2021 14:49:35 -0400 Subject: [PATCH 28/57] update the renumber_edgelist caller --- cpp/include/cugraph/utilities/cython.hpp | 5 +++++ cpp/src/structure/coarsen_graph_impl.cuh | 8 +++++++- cpp/src/structure/create_graph_from_edgelist_impl.hpp | 7 +++---- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- cpp/src/utilities/cython.cu | 4 +++- 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp index 3a4f437bfd0..c7bbe68b2c2 100644 --- a/cpp/include/cugraph/utilities/cython.hpp +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -270,6 +270,9 @@ struct renum_tuple_t { return std::make_unique>(segment_offsets_); } + vertex_t& get_num_unique_edge_majors(void) { return num_unique_edge_majors_; } + vertex_t& get_num_unique_edge_minors(void) { return num_unique_edge_minors_; } + // `partition_t` pass-through getters // int get_part_row_size() const { return part_.get_row_size(); } @@ -364,6 +367,8 @@ struct renum_tuple_t { vertex_t nv_{0}; edge_t ne_{0}; std::vector segment_offsets_; + vertex_t num_unique_edge_majors_{0}; + vertex_t num_unique_edge_minors_{0}; }; // FIXME: finish description for vertex_partition_offsets diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 345ce989e53..be424fcf0e3 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -437,7 +437,13 @@ coarsen_graph( minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); } - std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges, segment_offsets) = + std::tie(renumber_map_labels, + partition, + number_of_vertices, + number_of_edges, + segment_offsets, + std::ignore, + std::ignore) = renumber_edgelist( handle, std::optional>{ diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index 58991f2477c..d2353792a1b 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -23,9 +23,6 @@ #include -#include -#include - #include #include @@ -112,7 +109,9 @@ 
create_graph_from_edgelist_impl(raft::handle_t const& handle, partition, number_of_vertices, number_of_edges, - *vertex_partition_segment_offsets) = + *vertex_partition_segment_offsets, + std::ignore, + std::ignore) = cugraph::renumber_edgelist( handle, local_vertex_span diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index e5d0bf0ba30..a990e5b378e 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -47,7 +47,7 @@ namespace detail { // returns renumber map, segment_offsets, and # unique edge majors & minors template -std::tuple, std::vector, vertx_t, vertex_t> +std::tuple, std::vector, vertex_t, vertex_t> compute_renumber_map(raft::handle_t const& handle, std::optional> vertex_span, std::vector const& edgelist_major_vertices, diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 25d42ec1f22..6f295871446 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1216,7 +1216,9 @@ std::unique_ptr> call_renumber( p_ret->get_partition(), p_ret->get_num_vertices(), p_ret->get_num_edges(), - p_ret->get_segment_offsets()) = + p_ret->get_segment_offsets(), + p_ret->get_num_unique_edge_majors(), + p_ret->get_num_unique_edge_minors()) = cugraph::renumber_edgelist(handle, std::nullopt, major_ptrs, From 8b07a3c8a506ebba2930aa2afa021a7ab47e0015 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 15:49:38 -0400 Subject: [PATCH 29/57] update MG graph_t callers --- cpp/include/cugraph/graph.hpp | 2 + cpp/include/cugraph/graph_functions.hpp | 2 +- cpp/include/cugraph/utilities/cython.hpp | 4 ++ cpp/src/structure/coarsen_graph_impl.cuh | 10 +++-- .../create_graph_from_edgelist_impl.hpp | 10 +++-- cpp/src/structure/graph_impl.cuh | 2 + cpp/src/structure/renumber_edgelist_impl.cuh | 14 ++++--- cpp/src/utilities/cython.cu | 38 +++++++++++-------- 8 files changed, 53 insertions(+), 29 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 69ab403bbdb..75ae57a2f88 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -70,6 +70,8 @@ class graph_t> const& segment_offsets, + vertex_t num_local_unique_edge_rows, + vertex_t num_local_unique_edge_cols, bool do_expensive_check = false); bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 00331f15906..478df87d4c5 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -73,7 +73,7 @@ namespace cugraph { * the entire set of vertices (assigned to this process in multi-GPU), partition_t object storing * graph partitioning information, total number of vertices, total number of edges, vertex partition * segment offsets (a vertex partition is partitioned to multiple segments based on vertex degrees), - * and the number of unique edge major & minor vertex IDs. + * and the number of local unique edge major & minor vertex IDs. 
*/ template std::enable_if_t> segment_offsets{}; + vertex_t num_local_unique_edge_majors{}; + vertex_t num_local_unique_edge_minors{}; { std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); std::vector minor_ptrs(major_ptrs.size()); @@ -442,8 +444,8 @@ coarsen_graph( number_of_vertices, number_of_edges, segment_offsets, - std::ignore, - std::ignore) = + num_local_unique_edge_majors, + num_local_unique_edge_minors) = renumber_edgelist( handle, std::optional>{ @@ -479,7 +481,9 @@ coarsen_graph( number_of_vertices, number_of_edges, graph_properties_t{graph_view.is_symmetric(), false}, - segment_offsets), + segment_offsets, + store_transposed ? num_local_unique_edge_minors : num_local_unique_edge_majors, + store_transposed ? num_local_unique_edge_majors : num_local_unique_edge_minors), std::move(renumber_map_labels)); } diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index d2353792a1b..b7d21e9ac94 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -95,6 +95,8 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, cugraph::partition_t partition{}; vertex_t number_of_vertices{}; edge_t number_of_edges{}; + vertex_t num_local_unique_edge_majors{}; + vertex_t num_local_unique_edge_minors{}; auto vertex_partition_segment_offsets = std::make_optional>(0); { std::vector major_ptrs(col_comm_size); @@ -110,8 +112,8 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, number_of_vertices, number_of_edges, *vertex_partition_segment_offsets, - std::ignore, - std::ignore) = + num_local_unique_edge_majors, + num_local_unique_edge_minors) = cugraph::renumber_edgelist( handle, local_vertex_span @@ -145,7 +147,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, number_of_vertices, number_of_edges, graph_properties, - vertex_partition_segment_offsets), + vertex_partition_segment_offsets, + store_transposed ? num_local_unique_edge_minors : num_local_unique_edge_majors, + store_transposed ? 
num_local_unique_edge_majors : num_local_unique_edge_minors), std::optional>{std::move(renumber_map_labels)}); } diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index b226427d613..48f94712808 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -203,6 +203,8 @@ graph_t> const& segment_offsets, + vertex_t num_local_unique_edge_rows, + vertex_t num_local_unique_edge_cols, bool do_expensive_check) : detail::graph_base_t( handle, number_of_vertices, number_of_edges, properties), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index a990e5b378e..c9749b3202d 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -77,7 +77,7 @@ compute_renumber_map(raft::handle_t const& handle, rmm::device_uvector major_labels(0, handle.get_stream()); rmm::device_uvector major_counts(0, handle.get_stream()); - vertex_t num_unique_edge_majors{0}; + vertex_t num_local_unique_edge_majors{0}; for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { rmm::device_uvector tmp_major_labels(0, handle.get_stream()); rmm::device_uvector tmp_major_counts(0, handle.get_stream()); @@ -107,7 +107,7 @@ compute_renumber_map(raft::handle_t const& handle, tmp_major_labels.begin(), tmp_major_counts.begin()); } - num_unique_edge_majors += static_cast(tmp_major_labels.size()); + num_local_unique_edge_majors += static_cast(tmp_major_labels.size()); if (multi_gpu) { auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); @@ -174,7 +174,7 @@ compute_renumber_map(raft::handle_t const& handle, edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), handle.get_stream()); - vertex_t num_unique_edge_minors{0}; + vertex_t num_local_unique_edge_minors{0}; for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_minor_vertices[i], @@ -187,7 +187,7 @@ compute_renumber_map(raft::handle_t const& handle, minor_labels.begin(), thrust::unique(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end())), handle.get_stream()); - num_unique_edge_minors += static_cast(minor_labels.size()); + num_local_unique_edge_minors += static_cast(minor_labels.size()); if (multi_gpu) { auto& comm = handle.get_comms(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -370,8 +370,10 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); handle.get_stream_view().synchronize(); - return std::make_tuple( - std::move(labels), h_segment_offsets, num_unique_edge_majors, num_unique_edge_minors); + return std::make_tuple(std::move(labels), + h_segment_offsets, + num_local_unique_edge_majors, + num_local_unique_edge_minors); } template diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 6f295871446..f95f9e38ea8 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -169,6 +169,8 @@ std::unique_ptr> crea static_cast(graph_container.segment_offsets) + graph_container.num_segments + 1) : std::nullopt, + graph_container.num_local_unique_edge_rows, + graph_container.num_local_unique_edge_cols, graph_container.do_expensive_check); } @@ -221,6 +223,8 @@ void populate_graph_container(graph_container_t& graph_container, size_t num_local_edges, size_t num_global_vertices, size_t 
num_global_edges, + size_t num_local_unique_edge_rows, + size_t num_local_unique_edge_cols, bool is_weighted, bool is_symmetric, bool transposed, @@ -244,22 +248,24 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.col_comm_rank = col_comm_rank; } - graph_container.src_vertices = src_vertices; - graph_container.dst_vertices = dst_vertices; - graph_container.weights = weights; - graph_container.is_weighted = is_weighted; - graph_container.vertex_partition_offsets = vertex_partition_offsets; - graph_container.segment_offsets = segment_offsets; - graph_container.num_segments = num_segments; - graph_container.num_local_edges = num_local_edges; - graph_container.num_global_vertices = num_global_vertices; - graph_container.num_global_edges = num_global_edges; - graph_container.vertexType = vertexType; - graph_container.edgeType = edgeType; - graph_container.weightType = weightType; - graph_container.transposed = transposed; - graph_container.is_multi_gpu = multi_gpu; - graph_container.do_expensive_check = do_expensive_check; + graph_container.src_vertices = src_vertices; + graph_container.dst_vertices = dst_vertices; + graph_container.weights = weights; + graph_container.is_weighted = is_weighted; + graph_container.vertex_partition_offsets = vertex_partition_offsets; + graph_container.segment_offsets = segment_offsets; + graph_container.num_segments = num_segments; + graph_container.num_local_edges = num_local_edges; + graph_container.num_global_vertices = num_global_vertices; + graph_container.num_global_edges = num_global_edges; + graph_container.num_local_unique_edge_rows = num_local_unique_edge_rows; + graph_container.num_local_unique_edge_cols = num_local_unique_edge_cols; + graph_container.vertexType = vertexType; + graph_container.edgeType = edgeType; + graph_container.weightType = weightType; + graph_container.transposed = transposed; + graph_container.is_multi_gpu = multi_gpu; + graph_container.do_expensive_check = do_expensive_check; graph_properties_t graph_props{.is_symmetric = is_symmetric, .is_multigraph = false}; graph_container.graph_props = graph_props; From f11be0897c6e57934afb16f45789c68e26447e9e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 16:52:44 -0400 Subject: [PATCH 30/57] remove std::optional from the input parameter segment_offsets of MG graph_t (as renumbering is mandatory in MG, so segment_offsets is not optional for MG) --- cpp/include/cugraph/graph.hpp | 2 +- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- .../create_graph_from_edgelist_impl.hpp | 4 ++-- cpp/src/structure/graph_impl.cuh | 22 ++++++++----------- cpp/src/utilities/cython.cu | 9 +++----- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 75ae57a2f88..09f29e714cb 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -69,7 +69,7 @@ class graph_t> const& segment_offsets, + std::vector const& segment_offsets, vertex_t num_local_unique_edge_rows, vertex_t num_local_unique_edge_cols, bool do_expensive_check = false); diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 70d96102f35..2bd17098f93 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -427,7 +427,7 @@ coarsen_graph( col_comm_rank); vertex_t number_of_vertices{}; edge_t number_of_edges{}; - std::optional> segment_offsets{}; + std::vector segment_offsets{}; vertex_t 
num_local_unique_edge_majors{}; vertex_t num_local_unique_edge_minors{}; { diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index b7d21e9ac94..340dfebeda1 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -97,7 +97,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, edge_t number_of_edges{}; vertex_t num_local_unique_edge_majors{}; vertex_t num_local_unique_edge_minors{}; - auto vertex_partition_segment_offsets = std::make_optional>(0); + std::vector vertex_partition_segment_offsets{}; { std::vector major_ptrs(col_comm_size); std::vector minor_ptrs(major_ptrs.size()); @@ -111,7 +111,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, partition, number_of_vertices, number_of_edges, - *vertex_partition_segment_offsets, + vertex_partition_segment_offsets, num_local_unique_edge_majors, num_local_unique_edge_minors) = cugraph::renumber_edgelist( diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 48f94712808..8a998f7854f 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -202,7 +202,7 @@ graph_t> const& segment_offsets, + std::vector const& segment_offsets, vertex_t num_local_unique_edge_rows, vertex_t num_local_unique_edge_cols, bool do_expensive_check) @@ -227,16 +227,12 @@ graph_t(col_comm_size), "Invalid input argument: errneous edgelists.size()."); CUGRAPH_EXPECTS( - !segment_offsets.has_value() || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)) || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), + (segment_offsets.size() == (detail::num_sparse_segments_per_vertex_partition + 1)) || + (segment_offsets.size() == (detail::num_sparse_segments_per_vertex_partition + 2)), "Invalid input argument: segment_offsets.size() returns an invalid value."); auto is_weighted = edgelists[0].p_edge_weights.has_value(); - auto use_dcs = - segment_offsets - ? ((*segment_offsets).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) - : false; + auto use_dcs = segment_offsets.size() > (detail::num_sparse_segments_per_vertex_partition + 1); CUGRAPH_EXPECTS( std::any_of(edgelists.begin(), @@ -286,12 +282,12 @@ graph_t d_segment_offsets((*segment_offsets).size(), default_stream_view); + rmm::device_uvector d_segment_offsets(segment_offsets.size(), default_stream_view); raft::update_device(d_segment_offsets.data(), - (*segment_offsets).data(), - (*segment_offsets).size(), + segment_offsets.data(), + segment_offsets.size(), default_stream_view.value()); rmm::device_uvector d_aggregate_segment_offsets( col_comm_size * d_segment_offsets.size(), default_stream_view); @@ -332,7 +328,7 @@ graph_t{major_first + (*adj_matrix_partition_segment_offsets_) - [(*segment_offsets).size() * i + + [segment_offsets.size() * i + detail::num_sparse_segments_per_vertex_partition]} : std::nullopt; auto [offsets, indices, weights, dcs_nzd_vertices] = diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index f95f9e38ea8..71bdac0448c 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -163,12 +163,9 @@ std::unique_ptr> crea static_cast(graph_container.num_global_vertices), static_cast(graph_container.num_global_edges), graph_container.graph_props, - graph_container.segment_offsets != nullptr - ? 
std::make_optional>( - static_cast(graph_container.segment_offsets), - static_cast(graph_container.segment_offsets) + - graph_container.num_segments + 1) - : std::nullopt, + std::vector(static_cast(graph_container.segment_offsets), + static_cast(graph_container.segment_offsets) + + graph_container.num_segments + 1), graph_container.num_local_unique_edge_rows, graph_container.num_local_unique_edge_cols, graph_container.do_expensive_check); From 529544cf46f957b87558ecd6a924b559dacbea78 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 9 Sep 2021 15:22:20 -0400 Subject: [PATCH 31/57] update local unique edge rows/cols if using (key, value) pairs instead of contiguous arrays for row/col properties --- cpp/include/cugraph/graph.hpp | 4 +- cpp/include/cugraph/graph_view.hpp | 5 + cpp/src/structure/graph_impl.cuh | 146 ++++++++++++++++++++++++++++- 3 files changed, 148 insertions(+), 7 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 09f29e714cb..5bd5d5a5be3 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -136,8 +136,8 @@ class graph_t> local_sorted_unique_edge_rows{std::nullopt}; - std::optional> local_sorted_unique_edge_cols{std::nullopt}; + std::optional> local_sorted_unique_edge_rows_{std::nullopt}; + std::optional> local_sorted_unique_edge_cols_{std::nullopt}; }; // single-GPU version diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index f0040b9acb3..85135575f5c 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -223,6 +223,11 @@ namespace detail { using namespace cugraph::visitors; +// FIXME: threshold values require tuning +// use (key, value) pairs to store row/column properties if (unique edge rows/cols) over (V / +// row_comm_size|col_comm_size) is smaller than the threshold value +double constexpr row_col_properties_kv_pair_fill_ratio_threshold = 0.25; + // FIXME: threshold values require tuning // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller // than col_comm_size * hypersparse_threshold_ratio, should be less than 1.0 diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 8a998f7854f..09a32b8f9a2 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -41,7 +41,7 @@ namespace cugraph { namespace { -// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_view_t") for an +// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an // extended __device__ lambda must allow its address to be taken) template struct out_of_range_t { @@ -59,6 +59,20 @@ struct out_of_range_t { } }; +// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an +// extended __device__ lambda must allow its address to be taken) +template +struct has_nzd_t { + edge_t const* offsets{nullptr}; + vertex_t major_first{}; + + __device__ bool operator()(vertex_t major) const + { + auto major_offset = major - major_first; + return offsets[major_offset + 1] - offsets[major_offset] > 0; + } +}; + // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid template std::tuple, @@ -251,12 +265,12 @@ graph_tget_number_of_edges(), "Invalid input argument: the sum of local edge counts does not match with number_of_edges."); @@ -278,6 +292,39 @@ graph_t majors(number_of_local_edges, handle.get_stream()); + rmm::device_uvector 
minors(number_of_local_edges, handle.get_stream()); + size_t cur_size{0}; + for (size_t i = 0; i < edgelists.size(); ++i) { + auto p_majors = store_transposed ? edgelists[i].p_dst_vertices : edgelists[i].p_src_vertices; + auto p_minors = store_transposed ? edgelists[i].p_src_vertices : edgelists[i].p_dst_vertices; + thrust::copy(handle.get_thrust_policy(), + p_majors, + p_majors + edgelists[i].number_of_edges, + majors.begin() + cur_size); + thrust::copy(handle.get_thrust_policy(), + p_minors, + p_minors + edgelists[i].number_of_edges, + minors.begin() + cur_size); + } + thrust::sort(handle.get_thrust_policy(), majors.begin(), majors.end()); + thrust::sort(handle.get_thrust_policy(), minors.begin(), minors.end()); + auto num_local_unique_edge_majors = static_cast(thrust::distance( + majors.begin(), thrust::unique(handle.get_thrust_policy(), majors.begin(), majors.end()))); + auto num_local_unique_edge_minors = static_cast(thrust::distance( + minors.begin(), thrust::unique(handle.get_thrust_policy(), minors.begin(), minors.end()))); + if constexpr (store_transposed) { + CUGRAPH_EXPECTS(num_local_unique_edge_majors == num_local_unique_edge_cols, + "Invalid input argument: num_unique_edge_cols is erroneous."); + CUGRAPH_EXPECTS(num_local_unique_edge_minors == num_local_unique_edge_rows, + "Invalid input argument: num_unique_edge_rows is erroneous."); + } else { + CUGRAPH_EXPECTS(num_local_unique_edge_majors == num_local_unique_edge_rows, + "Invalid input argument: num_unique_edge_rows is erroneous."); + CUGRAPH_EXPECTS(num_local_unique_edge_minors == num_local_unique_edge_cols, + "Invalid input argument: num_unique_edge_cols is erroneous."); + } } // aggregate segment_offsets @@ -350,6 +397,95 @@ graph_t(num_local_unique_edge_majors) / static_cast(aggregate_major_size), + handle.get_stream()); + auto max_minor_properties_fill_ratio = host_scalar_allreduce( + comm, + static_cast(num_local_unique_edge_minors) / static_cast(minor_size), + handle.get_stream()); + + if (max_major_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { + rmm::device_uvector local_sorted_unique_edge_majors(num_local_unique_edge_majors, + handle.get_stream()); + size_t cur_size{0}; + for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { + auto [major_first, major_last] = partition.get_matrix_partition_major_range(i); + cur_size += thrust::distance( + local_sorted_unique_edge_majors.data() + cur_size, + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_first), + thrust::make_counting_iterator(major_last), + local_sorted_unique_edge_majors.data() + cur_size, + has_nzd_t{adj_matrix_partition_offsets_[i].data(), major_first})); + } + assert(cur_size == num_local_unique_edge_majors); + if constexpr (store_transposed) { + local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_majors); + } else { + local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_majors); + } + } + + if (max_minor_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { + rmm::device_uvector local_sorted_unique_edge_minors(0, handle.get_stream()); + for (size_t i = 0; i < adj_matrix_partition_indices_.size(); ++i) { + rmm::device_uvector tmp_minors(adj_matrix_partition_indices_[i].size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + adj_matrix_partition_indices_[i].begin(), + adj_matrix_partition_indices_[i].end(), + tmp_minors.begin()); + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), 
tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + auto cur_size = local_sorted_unique_edge_minors.size(); + if (cur_size == 0) { + local_sorted_unique_edge_minors = std::move(tmp_minors); + } else { + local_sorted_unique_edge_minors.resize( + local_sorted_unique_edge_minors.size() + tmp_minors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + tmp_minors.begin(), + tmp_minors.end(), + local_sorted_unique_edge_minors.begin() + cur_size); + } + } + thrust::sort(handle.get_thrust_policy(), + local_sorted_unique_edge_minors.begin(), + local_sorted_unique_edge_minors.end()); + local_sorted_unique_edge_minors.resize( + thrust::distance(local_sorted_unique_edge_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + local_sorted_unique_edge_minors.begin(), + local_sorted_unique_edge_minors.end())), + handle.get_stream()); + local_sorted_unique_edge_minors.shrink_to_fit(handle.get_stream()); + if constexpr (store_transposed) { + local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_minors); + } else { + local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_minors); + } + } + // optional expensive checks (part 2/2) if (do_expensive_check) { From d2549ccd94d95d94c5327ea1cff55625e9b52b0d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 9 Sep 2021 16:27:57 -0400 Subject: [PATCH 32/57] update graph_view to take optional local edge rows/cols --- cpp/include/cugraph/graph.hpp | 8 ++++++++ cpp/include/cugraph/graph_view.hpp | 4 ++++ cpp/src/structure/graph_view_impl.cuh | 10 +++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 5bd5d5a5be3..8b40287f3ae 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -115,6 +115,14 @@ class graph_tget_number_of_edges(), this->get_graph_properties(), adj_matrix_partition_segment_offsets_, + local_sorted_unique_edge_rows_ ? std::make_optional((*local_sorted_unique_edge_rows_).data()) : std::nullopt, + local_sorted_unique_edge_rows_ + ? std::make_optional((*local_sorted_unique_edge_rows_).data() + (*local_sorted_unique_edge_rows_).size()) + : std::nullopt, + local_sorted_unique_edge_cols_ ? std::make_optional((*local_sorted_unique_edge_cols_).data()) : std::nullopt, + local_sorted_unique_edge_cols_ + ? 
std::make_optional((*local_sorted_unique_edge_cols_).data() + (*local_sorted_unique_edge_cols_).size()) + : std::nullopt, false); } diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 85135575f5c..ae7bffa5e18 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -332,6 +332,10 @@ class graph_view_t> const& adj_matrix_partition_segment_offsets, + std::optional local_sorted_unique_edge_row_first, + std::optional local_sorted_unique_edge_row_last, + std::optional local_sorted_unique_edge_col_first, + std::optional local_sorted_unique_edge_col_last, bool do_expensive_check = false); bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 156d86e5e76..b34ac1f5253 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -168,6 +168,10 @@ graph_view_t> const& adj_matrix_partition_segment_offsets, + std::optional local_sorted_unique_edge_row_first, + std::optional local_sorted_unique_edge_row_last, + std::optional local_sorted_unique_edge_col_first, + std::optional local_sorted_unique_edge_col_last, bool do_expensive_check) : detail::graph_base_t( handle, number_of_vertices, number_of_edges, properties), @@ -183,7 +187,11 @@ graph_view_t Date: Fri, 10 Sep 2021 15:36:49 -0400 Subject: [PATCH 33/57] refactor input parameters/return values of renumber_edgelist & graph_(view_)t constructors to better group meta information --- cpp/include/cugraph/graph.hpp | 56 ++++++--- cpp/include/cugraph/graph_functions.hpp | 50 +++++--- cpp/include/cugraph/graph_view.hpp | 38 ++++-- .../cugraph/visitors/graph_factory.hpp | 26 ++-- cpp/src/structure/coarsen_graph_impl.cuh | 48 ++++---- .../create_graph_from_edgelist_impl.hpp | 66 +++++----- cpp/src/structure/graph_impl.cuh | 66 +++++----- cpp/src/structure/graph_view_impl.cuh | 116 +++++++++--------- cpp/src/structure/renumber_edgelist_impl.cuh | 25 ++-- cpp/src/structure/renumber_edgelist_mg.cu | 60 ++++----- cpp/src/structure/renumber_edgelist_sg.cu | 6 +- cpp/src/utilities/cython.cu | 68 +++++----- cpp/tests/community/mg_louvain_helper.cu | 7 +- cpp/tests/structure/graph_test.cpp | 5 +- cpp/tests/visitors/bfs_test.cpp | 3 +- 15 files changed, 332 insertions(+), 308 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index de33469e792..0ab801f2191 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -37,6 +37,32 @@ struct edgelist_t { edge_t number_of_edges{0}; }; +template +struct graph_meta_t; + +// multi-GPU version +template +struct graph_meta_t> { + vertex_t number_of_vertices{}; + edge_t number_of_edges{}; + graph_properties_t properties{}; + + partition_t partition{}; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> segment_offsets{std::nullopt}; +}; + +// single-GPU version +template +struct graph_meta_t> { + vertex_t number_of_vertices{}; + graph_properties_t properties{}; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> segment_offsets{std::nullopt}; +}; + // graph_t is an owning graph class (note that graph_view_t is a non-owning graph class) template > const& edgelists, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& segment_offsets, + 
graph_meta_t meta, bool do_expensive_check = false); bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } @@ -108,11 +130,13 @@ class graph_tget_number_of_vertices(), - this->get_number_of_edges(), - this->get_graph_properties(), - adj_matrix_partition_segment_offsets_, + graph_view_meta_t{ + this->get_number_of_vertices(), + this->get_number_of_edges(), + this->get_graph_properties(), + partition_, + adj_matrix_partition_segment_offsets_, + }, false); } @@ -155,9 +179,7 @@ class graph_t const& edgelist, - vertex_t number_of_vertices, - graph_properties_t properties, - std::optional> const& segment_offsets, + graph_meta_t meta, bool do_expensive_check = false); bool is_weighted() const { return weights_.has_value(); } @@ -169,10 +191,10 @@ class graph_t{(*weights_).data()} : std::nullopt, - this->get_number_of_vertices(), - this->get_number_of_edges(), - this->get_graph_properties(), - segment_offsets_, + graph_view_meta_t{this->get_number_of_vertices(), + this->get_number_of_edges(), + this->get_graph_properties(), + segment_offsets_}, false); } diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 61bf30e86a2..e11a1f8a2dc 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -28,6 +28,23 @@ namespace cugraph { +template +struct renumber_meta_t { +}; + +template +struct renumber_meta_t> { + vertex_t number_of_vertices{}; + edge_t number_of_edges{}; + partition_t partition{}; + std::vector segment_offsets{}; +}; + +template +struct renumber_meta_t> { + std::vector segment_offsets{}; +}; + /** * @brief renumber edgelist (multi-GPU) * @@ -68,19 +85,18 @@ namespace cugraph { * compute_gpu_id_from_vertex_t function to edge minor vertex IDs. This optinoal information is used * for further memory footprint optimization if provided. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). - * @return std::tuple, partition_t, vertex_t, edge_t, - * std::vector> Tuple of labels (vertex IDs before renumbering) for the entire set of - * vertices (assigned to this process in multi-GPU), partition_t object storing graph partitioning - * information, total number of vertices, total number of edges, and vertex partition segment - * offsets (a vertex partition is partitioned to multiple segments based on vertex degrees). + * @return std::tuple, renumber_meta_t> + * Tuple of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to this + * process in multi-GPU) and meta-data collected while renumbering. The meta-data includes total + * number of vertices, total number of edges, partition_t object storing graph partitioning + * information, and vertex partition segment offsets (a vertex partition is partitioned to multiple + * segments based on vertex degrees). This meta-data is expected to be used in graph construction & + * graph primitives. */ template -std::enable_if_t, - partition_t, - vertex_t, - edge_t, - std::vector>> +std::enable_if_t< + multi_gpu, + std::tuple, renumber_meta_t>> renumber_edgelist( raft::handle_t const& handle, std::optional> local_vertex_span, @@ -112,12 +128,16 @@ renumber_edgelist( * Vertex IDs are updated in-place ([INOUT] parameter). * @param num_edgelist_edges Number of edges in the edgelist. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
- * @return std::tuple, std::vector> Tuple of abels (vertex - * IDs before renumbering) for the entire set of vertices and vertex partition segment offsets (a - * vertex partition is partitioned to multiple segments based on vertex degrees). + * @return std::tuple, renumber_meta_t> + * Tuple of labels (vertex IDs before renumbering) for the entire set of vertices and meta-data + * collected while renumbering. The meta-data includes vertex partition segment offsets (a vertex + * partition is partitioned to multiple segments based on vertex degrees). This meta-data is + * expected to be used in graph construction & graph primitives. */ template -std::enable_if_t, std::vector>> +std::enable_if_t< + !multi_gpu, + std::tuple, renumber_meta_t>> renumber_edgelist(raft::handle_t const& handle, std::optional> vertex_span, vertex_t* edgelist_major_vertices /* [INOUT] */, diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 81aa00fd2ea..cb18571886c 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -286,6 +286,33 @@ class graph_base_t : public graph_envelope_t::base_graph_t /*<- visitor logic*/ } // namespace detail +template +struct graph_view_meta_t; + +// multi-GPU version +template +struct graph_view_meta_t> { + vertex_t number_of_vertices; + edge_t number_of_edges; + graph_properties_t properties; + + partition_t partition{}; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> adj_matrix_partition_segment_offsets{}; +}; + +// single-GPU version +template +struct graph_view_meta_t> { + vertex_t number_of_vertices; + edge_t number_of_edges; + graph_properties_t properties; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> segment_offsets{std::nullopt}; +}; + // graph_view_t is a non-owning graph class (note that graph_t is an owning graph class) template > const& adj_matrix_partition_weights, std::optional> const& adj_matrix_partition_dcs_nzd_vertices, std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& adj_matrix_partition_segment_offsets, + graph_view_meta_t meta, bool do_expensive_check = false); bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } @@ -621,10 +644,7 @@ class graph_view_t weights, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& segment_offsets, + graph_view_meta_t meta, bool do_expensive_check = false); bool is_weighted() const { return weights_.has_value(); } diff --git a/cpp/include/cugraph/visitors/graph_factory.hpp b/cpp/include/cugraph/visitors/graph_factory.hpp index 5e25624f814..9c8198bf3b0 100644 --- a/cpp/include/cugraph/visitors/graph_factory.hpp +++ b/cpp/include/cugraph/visitors/graph_factory.hpp @@ -159,15 +159,11 @@ struct graph_factory_t< std::optional> opt_seg_off{}; // FIXME: may needd to pass/extract segment_offsets vector + graph_meta_t meta{ + num_global_vertices, num_global_edges, graph_props, partition, opt_seg_off}; + return std::make_unique>( - handle, - edgelist, - partition, - num_global_vertices, - num_global_edges, - graph_props, - opt_seg_off, - do_expensive_check); + handle, edgelist, meta, do_expensive_check); } }; @@ -184,24 +180,18 @@ struct graph_factory_t< /// std::cout << "Single-GPU factory.\n"; std::vector 
const& v_args{ep.get_args()}; - assert(v_args.size() == 6); + assert(v_args.size() == 4); raft::handle_t const& handle = *static_cast(v_args[0]); auto const& elist = *static_cast const*>(v_args[1]); - auto nv = *static_cast(v_args[2]); - - auto props = *static_cast(v_args[3]); - - bool sorted = *static_cast(v_args[4]); // FIXME: no need to pass this! - - bool check = *static_cast(v_args[5]); + auto meta = *static_cast const*>(v_args[2]); - std::optional> opt_seg_off{}; // should not be needed for (!multi_gpu) + bool check = *static_cast(v_args[3]); return std::make_unique>( - handle, elist, nv, props, opt_seg_off, check); + handle, elist, meta, check); } }; diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 345ce989e53..716c9f67993 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -420,14 +420,7 @@ coarsen_graph( // 4. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); - partition_t partition(std::vector(comm_size + 1, 0), - row_comm_size, - col_comm_size, - row_comm_rank, - col_comm_rank); - vertex_t number_of_vertices{}; - edge_t number_of_edges{}; - std::optional> segment_offsets{}; + renumber_meta_t meta{}; { std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); std::vector minor_ptrs(major_ptrs.size()); @@ -437,16 +430,15 @@ coarsen_graph( minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); } - std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges, segment_offsets) = - renumber_edgelist( - handle, - std::optional>{ - std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, - major_ptrs, - minor_ptrs, - counts, - std::nullopt, - do_expensive_check); + std::tie(renumber_map_labels, meta) = renumber_edgelist( + handle, + std::optional>{ + std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, + major_ptrs, + minor_ptrs, + counts, + std::nullopt, + do_expensive_check); } // 5. 
build a graph @@ -469,11 +461,12 @@ coarsen_graph( std::make_unique>( handle, edgelists, - partition, - number_of_vertices, - number_of_edges, - graph_properties_t{graph_view.is_symmetric(), false}, - segment_offsets), + graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties_t{graph_view.is_symmetric(), false}, + meta.partition, + meta.segment_offsets}), std::move(renumber_map_labels)); } @@ -519,7 +512,7 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - auto [renumber_map_labels, segment_offsets] = renumber_edgelist( + auto [renumber_map_labels, meta] = renumber_edgelist( handle, std::optional>{ std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, @@ -542,9 +535,10 @@ coarsen_graph( std::make_unique>( handle, edgelist, - static_cast(renumber_map_labels.size()), - graph_properties_t{graph_view.is_symmetric(), false}, - segment_offsets), + graph_meta_t{ + static_cast(renumber_map_labels.size()), + graph_properties_t{graph_view.is_symmetric(), false}, + meta.segment_offsets}), std::move(renumber_map_labels)); } diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index 58991f2477c..d60d14c8ac6 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -95,10 +95,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, // 2. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); - cugraph::partition_t partition{}; - vertex_t number_of_vertices{}; - edge_t number_of_edges{}; - auto vertex_partition_segment_offsets = std::make_optional>(0); + renumber_meta_t meta{}; { std::vector major_ptrs(col_comm_size); std::vector minor_ptrs(major_ptrs.size()); @@ -108,21 +105,16 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, minor_ptrs[i] = (store_transposed ? edgelist_rows.begin() : edgelist_cols.begin()) + edgelist_displacements[i]; } - std::tie(renumber_map_labels, - partition, - number_of_vertices, - number_of_edges, - *vertex_partition_segment_offsets) = - cugraph::renumber_edgelist( - handle, - local_vertex_span - ? std::optional>{std::make_tuple( - (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))} - : std::nullopt, - major_ptrs, - minor_ptrs, - edgelist_edge_counts, - edgelist_intra_partition_segment_offsets); + std::tie(renumber_map_labels, meta) = cugraph::renumber_edgelist( + handle, + local_vertex_span + ? std::optional>{std::make_tuple( + (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))} + : std::nullopt, + major_ptrs, + minor_ptrs, + edgelist_edge_counts, + edgelist_intra_partition_segment_offsets); } // 3. create a graph @@ -142,11 +134,11 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, cugraph::graph_t( handle, edgelists, - partition, - number_of_vertices, - number_of_edges, - graph_properties, - vertex_partition_segment_offsets), + cugraph::graph_meta_t{meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.segment_offsets}), std::optional>{std::move(renumber_map_labels)}); } @@ -171,17 +163,16 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, renumber ? 
std::make_optional>(0, handle.get_stream()) : std::nullopt; std::optional> segment_offsets{std::nullopt}; + renumber_meta_t meta{}; if (renumber) { - segment_offsets = std::vector{}; - std::tie(*renumber_map_labels, *segment_offsets) = - cugraph::renumber_edgelist( - handle, - vertex_span ? std::optional>{std::make_tuple( - (*vertex_span).data(), static_cast((*vertex_span).size()))} - : std::nullopt, - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? edgelist_rows.data() : edgelist_cols.data(), - static_cast(edgelist_rows.size())); + std::tie(*renumber_map_labels, meta) = cugraph::renumber_edgelist( + handle, + vertex_span ? std::optional>{std::make_tuple( + (*vertex_span).data(), static_cast((*vertex_span).size()))} + : std::nullopt, + store_transposed ? edgelist_cols.data() : edgelist_rows.data(), + store_transposed ? edgelist_rows.data() : edgelist_cols.data(), + static_cast(edgelist_rows.size())); } vertex_t num_vertices{}; @@ -205,9 +196,10 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, static_cast(edgelist_rows.size())}, - num_vertices, - graph_properties, - std::optional>{segment_offsets}), + cugraph::graph_meta_t{ + num_vertices, + graph_properties, + renumber ? std::optional>{meta.segment_offsets} : std::nullopt}), std::move(renumber_map_labels)); } diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index b226427d613..e54d69204ac 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -198,15 +198,11 @@ template >:: graph_t(raft::handle_t const& handle, std::vector> const& edgelists, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& segment_offsets, + graph_meta_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, number_of_vertices, number_of_edges, properties), - partition_(partition) + handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + partition_(meta.partition) { // cheap error checks @@ -225,15 +221,16 @@ graph_t(col_comm_size), "Invalid input argument: errneous edgelists.size()."); CUGRAPH_EXPECTS( - !segment_offsets.has_value() || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)) || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), - "Invalid input argument: segment_offsets.size() returns an invalid value."); + !(meta.segment_offsets).has_value() || + ((*(meta.segment_offsets)).size() == + (detail::num_sparse_segments_per_vertex_partition + 1)) || + ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), + "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); auto is_weighted = edgelists[0].p_edge_weights.has_value(); auto use_dcs = - segment_offsets - ? ((*segment_offsets).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + meta.segment_offsets + ? 
((*(meta.segment_offsets)).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) : false; CUGRAPH_EXPECTS( @@ -255,8 +252,8 @@ graph_tget_number_of_edges(), - "Invalid input argument: the sum of local edge counts does not match with number_of_edges."); + CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), + "Invalid input argument: the sum of local edge counts does not match with " + "meta.number_of_edges."); CUGRAPH_EXPECTS( - partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices, - "Invalid input argument: vertex partition should cover [0, number_of_vertices)."); + partition_.get_vertex_partition_last(comm_size - 1) == meta.number_of_vertices, + "Invalid input argument: vertex partition should cover [0, meta.number_of_vertices)."); } // aggregate segment_offsets - if (segment_offsets) { + if (meta.segment_offsets) { // FIXME: we need to add host_allgather - rmm::device_uvector d_segment_offsets((*segment_offsets).size(), default_stream_view); + rmm::device_uvector d_segment_offsets((*(meta.segment_offsets)).size(), + default_stream_view); raft::update_device(d_segment_offsets.data(), - (*segment_offsets).data(), - (*segment_offsets).size(), + (*(meta.segment_offsets)).data(), + (*(meta.segment_offsets)).size(), default_stream_view.value()); rmm::device_uvector d_aggregate_segment_offsets( col_comm_size * d_segment_offsets.size(), default_stream_view); @@ -325,12 +323,12 @@ graph_t{major_first + (*adj_matrix_partition_segment_offsets_) - [(*segment_offsets).size() * i + + [(*(meta.segment_offsets)).size() * i + detail::num_sparse_segments_per_vertex_partition]} : std::nullopt; auto [offsets, indices, weights, dcs_nzd_vertices] = @@ -371,15 +369,13 @@ template >:: graph_t(raft::handle_t const& handle, edgelist_t const& edgelist, - vertex_t number_of_vertices, - graph_properties_t properties, - std::optional> const& segment_offsets, + graph_meta_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, number_of_vertices, edgelist.number_of_edges, properties), + handle, meta.number_of_vertices, edgelist.number_of_edges, meta.properties), offsets_(rmm::device_uvector(0, handle.get_stream_view())), indices_(rmm::device_uvector(0, handle.get_stream_view())), - segment_offsets_(segment_offsets) + segment_offsets_(meta.segment_offsets) { // cheap error checks @@ -397,9 +393,9 @@ graph_t 0."); CUGRAPH_EXPECTS( - !segment_offsets.has_value() || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), - "Invalid input argument: segment_offsets.size() returns an invalid value."); + !segment_offsets_.has_value() || + ((*segment_offsets_).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), + "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); // optional expensive checks (part 1/2) diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 156d86e5e76..368573d4a91 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -163,14 +163,10 @@ graph_view_t> const& adj_matrix_partition_weights, std::optional> const& adj_matrix_partition_dcs_nzd_vertices, std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& adj_matrix_partition_segment_offsets, + graph_view_meta_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, 
number_of_vertices, number_of_edges, properties), + handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), adj_matrix_partition_offsets_(adj_matrix_partition_offsets), adj_matrix_partition_indices_(adj_matrix_partition_indices), adj_matrix_partition_weights_(adj_matrix_partition_weights), @@ -179,11 +175,11 @@ graph_view_t{}), - "Invalid Invalid input argument: adj_matrix_partition_segment_offsets are " - "provided, but degrees are not in descending order."); + "Invalid Invalid input argument: meta.adj_matrix_partition_segment_offsets " + "are provided, but degrees are not in descending order."); auto num_segments_per_vertex_partition = detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 1 : 0); for (int i = 0; i < col_comm_size; ++i) { - CUGRAPH_EXPECTS(std::is_sorted((*adj_matrix_partition_segment_offsets).begin() + + CUGRAPH_EXPECTS(std::is_sorted((*(meta.adj_matrix_partition_segment_offsets)).begin() + (num_segments_per_vertex_partition + 1) * i, - (*adj_matrix_partition_segment_offsets).begin() + + (*(meta.adj_matrix_partition_segment_offsets)).begin() + (num_segments_per_vertex_partition + 1) * (i + 1)), - "Internal Error: erroneous adj_matrix_partition_segment_offsets."); + "Internal Error: erroneous meta.adj_matrix_partition_segment_offsets."); CUGRAPH_EXPECTS( - (*adj_matrix_partition_segment_offsets)[(num_segments_per_vertex_partition + 1) * i] == 0, - "Internal Error: erroneous adj_matrix_partition_segment_offsets."); + (*(meta.adj_matrix_partition_segment_offsets))[(num_segments_per_vertex_partition + 1) * + i] == 0, + "Internal Error: erroneous meta.adj_matrix_partition_segment_offsets."); auto vertex_partition_idx = row_comm_size * i + row_comm_rank; CUGRAPH_EXPECTS( - (*adj_matrix_partition_segment_offsets)[(num_segments_per_vertex_partition + 1) * i + - num_segments_per_vertex_partition] == - partition.get_vertex_partition_size(vertex_partition_idx), - "Internal Error: erroneous adj_matrix_partition_segment_offsets."); + (*(meta + .adj_matrix_partition_segment_offsets))[(num_segments_per_vertex_partition + 1) * i + + num_segments_per_vertex_partition] == + partition_.get_vertex_partition_size(vertex_partition_idx), + "Internal Error: erroneous meta.adj_matrix_partition_segment_offsets."); } } - CUGRAPH_EXPECTS(partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices, - "Internal Error: vertex partition should cover [0, number_of_vertices)."); + CUGRAPH_EXPECTS( + partition_.get_vertex_partition_last(comm_size - 1) == this->get_number_of_vertices(), + "Internal Error: vertex partition should cover [0, number_of_vertices)."); // FIXME: check for symmetricity may better be implemetned with transpose(). 
if (this->is_symmetric()) {} @@ -327,34 +326,31 @@ template -graph_view_t>::graph_view_t(raft::handle_t const& handle, - edge_t const* offsets, - vertex_t const* indices, - std::optional weights, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& - segment_offsets, - bool do_expensive_check) +graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::graph_view_t(raft::handle_t const& handle, + edge_t const* offsets, + vertex_t const* indices, + std::optional weights, + graph_view_meta_t meta, + bool do_expensive_check) : detail::graph_base_t( - handle, number_of_vertices, number_of_edges, properties), + handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), offsets_(offsets), indices_(indices), weights_(weights), - segment_offsets_(segment_offsets) + segment_offsets_(meta.segment_offsets) { // cheap error checks CUGRAPH_EXPECTS( - !segment_offsets.has_value() || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), - "Internal Error: segment_offsets.size() returns an invalid value."); + !(meta.segment_offsets).has_value() || + ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), + "Internal Error: (*(meta.segment_offsets)).size() returns an invalid value."); // optional expensive checks @@ -374,20 +370,22 @@ graph_view_t{0, this->get_number_of_vertices()}) == 0, "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs."); - if (segment_offsets) { - auto degrees = detail::compute_major_degrees(handle, offsets, number_of_vertices); + if (meta.segment_offsets) { + auto degrees = detail::compute_major_degrees(handle, offsets, this->get_number_of_vertices()); CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), degrees.begin(), degrees.end(), thrust::greater{}), - "Invalid Invalid input argument: segment_offsets are provided, but degrees " + "Invalid Invalid input argument: meta.segment_offsets is valid, but degrees " "are not in descending order."); - CUGRAPH_EXPECTS(std::is_sorted((*segment_offsets).begin(), (*segment_offsets).end()), - "Internal Error: erroneous segment_offsets."); - CUGRAPH_EXPECTS((*segment_offsets)[0] == 0, "Invalid input argument segment_offsets."); - CUGRAPH_EXPECTS((*segment_offsets).back() == this->get_number_of_vertices(), - "Invalid input argument: segment_offsets."); + CUGRAPH_EXPECTS( + std::is_sorted((*(meta.segment_offsets)).begin(), (*(meta.segment_offsets)).end()), + "Internal Error: erroneous meta.segment_offsets."); + CUGRAPH_EXPECTS((*(meta.segment_offsets))[0] == 0, + "Invalid input argument meta.segment_offsets."); + CUGRAPH_EXPECTS((*(meta.segment_offsets)).back() == this->get_number_of_vertices(), + "Invalid input argument: meta.segment_offsets."); } // FIXME: check for symmetricity may better be implemetned with transpose(). 
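// The segment-offsets checks in the two constructors above reduce to a few host-side
// invariants. A standalone sketch (the sparse-segment count of 3 is an assumption
// standing in for detail::num_sparse_segments_per_vertex_partition):
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <cstdint>
//   #include <vector>
//
//   bool valid_segment_offsets(std::vector<int32_t> const& offsets, int32_t num_vertices)
//   {
//     constexpr std::size_t num_sparse_segments = 3;  // assumed value
//     bool size_ok = (offsets.size() == num_sparse_segments + 1) ||
//                    (offsets.size() == num_sparse_segments + 2);  // latter: + hypersparse
//     return size_ok && std::is_sorted(offsets.begin(), offsets.end()) &&
//            (offsets.front() == 0) && (offsets.back() == num_vertices);
//   }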
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 6efbf13e41b..8104db8eebc 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -605,12 +605,9 @@ void expensive_check_edgelist( } // namespace detail template -std::enable_if_t, - partition_t, - vertex_t, - edge_t, - std::vector>> +std::enable_if_t< + multi_gpu, + std::tuple, renumber_meta_t>> renumber_edgelist( raft::handle_t const& handle, std::optional> local_vertex_span, @@ -829,15 +826,16 @@ renumber_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif - return std::make_tuple(std::move(renumber_map_labels), - partition, - number_of_vertices, - number_of_edges, - vertex_partition_segment_offsets); + return std::make_tuple( + std::move(renumber_map_labels), + renumber_meta_t{ + number_of_vertices, number_of_edges, partition, vertex_partition_segment_offsets}); } template -std::enable_if_t, std::vector>> +std::enable_if_t< + !multi_gpu, + std::tuple, renumber_meta_t>> renumber_edgelist(raft::handle_t const& handle, std::optional> vertex_span, vertex_t* edgelist_major_vertices /* [INOUT] */, @@ -886,7 +884,8 @@ renumber_edgelist(raft::handle_t const& handle, renumber_map.find( edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices); - return std::make_tuple(std::move(renumber_map_labels), segment_offsets); + return std::make_tuple(std::move(renumber_map_labels), + renumber_meta_t{segment_offsets}); } } // namespace cugraph diff --git a/cpp/src/structure/renumber_edgelist_mg.cu b/cpp/src/structure/renumber_edgelist_mg.cu index 03ba230e598..4e9f37e10bb 100644 --- a/cpp/src/structure/renumber_edgelist_mg.cu +++ b/cpp/src/structure/renumber_edgelist_mg.cu @@ -19,40 +19,34 @@ namespace cugraph { // MG instantiation -template std:: - tuple, partition_t, int32_t, int32_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, renumber_meta_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> optional_local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); -template std:: - tuple, partition_t, int32_t, int64_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, renumber_meta_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> optional_local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); -template std:: - tuple, partition_t, int64_t, 
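// The renumber_edgelist_*.cu files below only explicitly instantiate the templates
// defined in renumber_edgelist_impl.cuh for each supported (vertex_t, edge_t,
// multi_gpu) combination, so template bodies stay out of user translation units.
// The mechanism in miniature (illustrative):
//
//   // lib.hpp: declaration only
//   template <typename T> T twice(T v);
//
//   // lib.cu: definition plus explicit instantiations
//   template <typename T> T twice(T v) { return v + v; }
//   template int twice<int>(int);
//   template long twice<long>(long);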
int64_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, renumber_meta_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> optional_local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/structure/renumber_edgelist_sg.cu b/cpp/src/structure/renumber_edgelist_sg.cu index e8409cdfe9f..3bb25d74b2e 100644 --- a/cpp/src/structure/renumber_edgelist_sg.cu +++ b/cpp/src/structure/renumber_edgelist_sg.cu @@ -19,7 +19,7 @@ namespace cugraph { // SG instantiation -template std::tuple, std::vector> +template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, std::optional> optional_vertex_span, @@ -28,7 +28,7 @@ renumber_edgelist( int32_t num_edgelist_edges, bool do_expensive_check); -template std::tuple, std::vector> +template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, std::optional> optional_vertex_span, @@ -37,7 +37,7 @@ renumber_edgelist( int64_t num_edgelist_edges, bool do_expensive_check); -template std::tuple, std::vector> +template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, std::optional> optional_vertex_span, diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 25d42ec1f22..bee78fb9952 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -159,16 +159,17 @@ std::unique_ptr> crea return std::make_unique>( handle, edgelists, - partition, - static_cast(graph_container.num_global_vertices), - static_cast(graph_container.num_global_edges), - graph_container.graph_props, - graph_container.segment_offsets != nullptr - ? std::make_optional>( - static_cast(graph_container.segment_offsets), - static_cast(graph_container.segment_offsets) + - graph_container.num_segments + 1) - : std::nullopt, + graph_meta_t{ + static_cast(graph_container.num_global_vertices), + static_cast(graph_container.num_global_edges), + graph_container.graph_props, + partition, + graph_container.segment_offsets != nullptr + ? std::make_optional>( + static_cast(graph_container.segment_offsets), + static_cast(graph_container.segment_offsets) + + graph_container.num_segments + 1) + : std::nullopt}, graph_container.do_expensive_check); } @@ -191,14 +192,15 @@ std::unique_ptr> crea return std::make_unique>( handle, edgelist, - static_cast(graph_container.num_global_vertices), - graph_container.graph_props, - graph_container.segment_offsets != nullptr - ? std::make_optional>( - static_cast(graph_container.segment_offsets), - static_cast(graph_container.segment_offsets) + - graph_container.num_segments + 1) - : std::nullopt, + graph_meta_t{ + static_cast(graph_container.num_global_vertices), + graph_container.graph_props, + graph_container.segment_offsets != nullptr + ? 
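// The Cython graph_container hands segment offsets over as a raw pointer plus a
// count; the call sites above wrap that pair into std::optional<std::vector<...>>.
// The same conversion in isolation (names illustrative):
//
//   #include <cstddef>
//   #include <cstdint>
//   #include <optional>
//   #include <vector>
//
//   std::optional<std::vector<std::int32_t>> to_optional_vector(std::int32_t const* p,
//                                                               std::size_t n)
//   {
//     return (p != nullptr) ? std::make_optional<std::vector<std::int32_t>>(p, p + n)
//                           : std::nullopt;
//   }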
std::make_optional>( + static_cast(graph_container.segment_offsets), + static_cast(graph_container.segment_offsets) + + graph_container.num_segments + 1) + : std::nullopt}, graph_container.do_expensive_check); } @@ -1212,20 +1214,16 @@ std::unique_ptr> call_renumber( minor_ptrs[i] = shuffled_edgelist_minor_vertices + displacements[i]; } - std::tie(p_ret->get_dv(), - p_ret->get_partition(), - p_ret->get_num_vertices(), - p_ret->get_num_edges(), - p_ret->get_segment_offsets()) = - cugraph::renumber_edgelist(handle, - std::nullopt, - major_ptrs, - minor_ptrs, - edge_counts, - std::nullopt, - do_expensive_check); + cugraph::renumber_meta_t meta{}; + std::tie(p_ret->get_dv(), meta) = cugraph::renumber_edgelist( + handle, std::nullopt, major_ptrs, minor_ptrs, edge_counts, std::nullopt, do_expensive_check); + p_ret->get_num_vertices() = meta.number_of_vertices; + p_ret->get_num_edges() = meta.number_of_edges; + p_ret->get_partition() = meta.partition; + p_ret->get_segment_offsets() = meta.segment_offsets; } else { - std::tie(p_ret->get_dv(), p_ret->get_segment_offsets()) = + cugraph::renumber_meta_t meta{}; + std::tie(p_ret->get_dv(), meta) = cugraph::renumber_edgelist(handle, std::nullopt, shuffled_edgelist_major_vertices, @@ -1233,10 +1231,10 @@ std::unique_ptr> call_renumber( edge_counts[0], do_expensive_check); - p_ret->get_partition() = cugraph::partition_t{}; // dummy - - p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size()); - p_ret->get_num_edges() = edge_counts[0]; + p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size()); + p_ret->get_num_edges() = edge_counts[0]; + p_ret->get_partition() = cugraph::partition_t{}; // dummy + p_ret->get_segment_offsets() = meta.segment_offsets; } return p_ret; // RVO-ed (copy ellision) diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index d52d8657e2a..0bb0d801229 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -254,9 +254,10 @@ coarsen_graph( return std::make_unique>( handle, edgelist, - new_number_of_vertices, - cugraph::graph_properties_t{graph_view.is_symmetric(), false}, - std::nullopt); + cugraph::graph_meta_t{ + new_number_of_vertices, + cugraph::graph_properties_t{graph_view.is_symmetric(), false}, + std::nullopt}); } // explicit instantiation diff --git a/cpp/tests/structure/graph_test.cpp b/cpp/tests/structure/graph_test.cpp index 614ef2528c5..7966a4d141c 100644 --- a/cpp/tests/structure/graph_test.cpp +++ b/cpp/tests/structure/graph_test.cpp @@ -132,9 +132,8 @@ class Tests_Graph : public ::testing::TestWithParam { auto graph = cugraph::graph_t( handle, edgelist, - number_of_vertices, - cugraph::graph_properties_t{is_symmetric, false}, - std::nullopt, + cugraph::graph_meta_t{ + number_of_vertices, cugraph::graph_properties_t{is_symmetric, false}, std::nullopt}, true); auto graph_view = graph.view(); diff --git a/cpp/tests/visitors/bfs_test.cpp b/cpp/tests/visitors/bfs_test.cpp index e6337cd459f..1b971ea84e0 100644 --- a/cpp/tests/visitors/bfs_test.cpp +++ b/cpp/tests/visitors/bfs_test.cpp @@ -137,7 +137,8 @@ class Tests_BFS : public ::testing::TestWithParam { bool sorted{false}; bool check{false}; - erased_pack_t ep_graph{&handle, &edgelist, &num_vertices, &graph_props, &sorted, &check}; + cugraph::graph_meta_t meta{num_vertices, graph_props, std::nullopt}; + erased_pack_t ep_graph{&handle, &edgelist, &meta, &check}; DTypes vertex_tid = reverse_dmap_t::type_id; DTypes edge_tid = reverse_dmap_t::type_id; From 
93e5839e82f9f9eeda31034e5db0f71f2eee5ea1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 10 Sep 2021 15:48:38 -0400 Subject: [PATCH 34/57] cosmetics --- cpp/include/cugraph/graph_functions.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index e11a1f8a2dc..b071f9eada4 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -29,8 +29,7 @@ namespace cugraph { template -struct renumber_meta_t { -}; +struct renumber_meta_t; template struct renumber_meta_t> { From d6a95c5811f5b517a52a060f2dec4deb7028c673 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 10 Sep 2021 17:55:42 -0400 Subject: [PATCH 35/57] additional fixes after merge --- cpp/include/cugraph/graph.hpp | 17 +++++++++++++++ cpp/include/cugraph/graph_functions.hpp | 3 +++ cpp/include/cugraph/graph_view.hpp | 5 +++++ cpp/src/structure/graph_impl.cuh | 22 ++++++++++---------- cpp/src/structure/renumber_edgelist_impl.cuh | 8 +++++-- 5 files changed, 42 insertions(+), 13 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index e679b1eafd6..22629c63458 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -51,6 +51,9 @@ struct graph_meta_t> { // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered std::optional> segment_offsets{std::nullopt}; + + vertex_t num_local_unique_edge_rows{}; + vertex_t num_local_unique_edge_cols{}; }; // single-GPU version @@ -136,6 +139,20 @@ class graph_tget_graph_properties(), partition_, adj_matrix_partition_segment_offsets_, + local_sorted_unique_edge_rows_ + ? std::optional{(*local_sorted_unique_edge_rows_).data()} + : std::nullopt, + local_sorted_unique_edge_rows_ + ? std::optional{(*local_sorted_unique_edge_rows_).data() + + (*local_sorted_unique_edge_rows_).size()} + : std::nullopt, + local_sorted_unique_edge_cols_ + ? std::optional{(*local_sorted_unique_edge_cols_).data()} + : std::nullopt, + local_sorted_unique_edge_cols_ + ? 
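// Each optional vector of locally used edge rows/cols is unpacked into an optional
// begin/end pointer pair for the view. The shape of that unpacking, reduced
// (span_view and to_view are hypothetical helpers, not part of this patch):
//
//   #include <cstdint>
//   #include <optional>
//   #include <vector>
//
//   struct span_view { std::int32_t const* first; std::int32_t const* last; };
//
//   std::optional<span_view> to_view(std::optional<std::vector<std::int32_t>> const& v)
//   {
//     return v ? std::make_optional(span_view{v->data(), v->data() + v->size()})
//              : std::nullopt;
//   }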
std::optional{(*local_sorted_unique_edge_cols_).data() + + (*local_sorted_unique_edge_cols_).size()} + : std::nullopt, }, false); } diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 7277008cc63..83cfc6d831b 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -37,6 +37,9 @@ struct renumber_meta_t> edge_t number_of_edges{}; partition_t partition{}; std::vector segment_offsets{}; + + vertex_t num_local_unique_edge_majors{}; + vertex_t num_local_unique_edge_minors{}; }; template diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index a27aa39356e..804b765ae4b 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -305,6 +305,11 @@ struct graph_view_meta_t> adj_matrix_partition_segment_offsets{}; + + std::optional local_sorted_unique_edge_row_first_{}; + std::optional local_sorted_unique_edge_row_last_{}; + std::optional local_sorted_unique_edge_col_first_{}; + std::optional local_sorted_unique_edge_col_last_{}; }; // single-GPU version diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 0f0c2cf76ee..3f6cc1a60fd 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -282,7 +282,7 @@ graph_tget_number_of_edges(), "Invalid input argument: the sum of local edge counts does not match with " @@ -314,14 +314,14 @@ graph_t(thrust::distance( minors.begin(), thrust::unique(handle.get_thrust_policy(), minors.begin(), minors.end()))); if constexpr (store_transposed) { - CUGRAPH_EXPECTS(num_local_unique_edge_majors == num_local_unique_edge_cols, + CUGRAPH_EXPECTS(num_local_unique_edge_majors == meta.num_local_unique_edge_cols, "Invalid input argument: num_unique_edge_cols is erroneous."); - CUGRAPH_EXPECTS(num_local_unique_edge_minors == num_local_unique_edge_rows, + CUGRAPH_EXPECTS(num_local_unique_edge_minors == meta.num_local_unique_edge_rows, "Invalid input argument: num_unique_edge_rows is erroneous."); } else { - CUGRAPH_EXPECTS(num_local_unique_edge_majors == num_local_unique_edge_rows, + CUGRAPH_EXPECTS(num_local_unique_edge_majors == meta.num_local_unique_edge_rows, "Invalid input argument: num_unique_edge_rows is erroneous."); - CUGRAPH_EXPECTS(num_local_unique_edge_minors == num_local_unique_edge_cols, + CUGRAPH_EXPECTS(num_local_unique_edge_minors == meta.num_local_unique_edge_cols, "Invalid input argument: num_unique_edge_cols is erroneous."); } } @@ -401,15 +401,15 @@ graph_t(num_local_unique_edge_majors) / static_cast(aggregate_major_size), @@ -424,7 +424,7 @@ graph_t{ - number_of_vertices, number_of_edges, partition, vertex_partition_segment_offsets}); + renumber_meta_t{number_of_vertices, + number_of_edges, + partition, + vertex_partition_segment_offsets, + num_unique_edge_majors, + num_unique_edge_minors}); } template From 7d3a5ed7a8ba6089a92e62c917f130670e2486c3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 14 Sep 2021 12:44:23 -0400 Subject: [PATCH 36/57] merge major&minor_properties --- .../cugraph/prims/row_col_properties.cuh | 200 +++++------------- cpp/src/community/louvain.cuh | 20 +- .../weakly_connected_components_impl.cuh | 2 +- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- cpp/src/traversal/sssp_impl.cuh | 2 +- 5 files changed, 62 insertions(+), 164 deletions(-) diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index 3aa14a2e859..6259a54b4fc 100644 --- 
a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -36,50 +36,41 @@ namespace cugraph { namespace detail { template -class major_properties_device_view_t { +class row_col_properties_device_view_t { public: using value_type = typename thrust::iterator_traits::value_type; - major_properties_device_view_t() = default; + row_col_properties_device_view_t() = default; - major_properties_device_view_t(ValueIterator value_first) : value_first_(value_first) {} - - void add_offset(vertex_t offset) { value_first_ += offset; } - - ValueIterator value_data() const { return value_first_; } - - __device__ ValueIterator get_iter(vertex_t offset) const { return value_first_ + offset; } - __device__ value_type get(vertex_t offset) const { return *get_iter(offset); } - - private: - ValueIterator value_first_{}; -}; - -template -class minor_properties_device_view_t { - public: - using value_type = typename thrust::iterator_traits::value_type; - - minor_properties_device_view_t() = default; - - minor_properties_device_view_t(ValueIterator value_first) - : key_first_(thrust::nullopt), key_last_(thrust::nullopt), value_first_(value_first) + row_col_properties_device_view_t(ValueIterator value_first) + : key_first_(thrust::nullopt), key_last_(thrust::nullopt), key_offset_(0), value_first_(value_first) { } - minor_properties_device_view_t(vertex_t const* key_first, - vertex_t const* key_last, - ValueIterator value_first) - : key_first_(key_first), key_last_(key_last), value_first_(value_first) + row_col_properties_device_view_t(vertex_t const* key_first, + vertex_t const* key_last, + ValueIterator value_first) + : key_first_(key_first), key_last_(key_last), key_offset_(0), value_first_(value_first) { } + void add_offset(vertex_t offset) { + if (key_first_) { + *key_offset_ += offset; + } + else { + value_first_ += offset; + } + } + + ValueIterator value_data() const { return value_first_; } + __device__ ValueIterator get_iter(vertex_t offset) const { auto value_offset = offset; if (key_first_) { - auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, offset); - assert((it != *key_last_) && (*it == offset)); + auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, offset + *key_offset_); + assert((it != *key_last_) && (*it == (offset + *key_offset_))); value_offset = static_cast(thrust::distance(*key_first_, it)); } return value_first_ + value_offset; @@ -90,66 +81,31 @@ class minor_properties_device_view_t { private: thrust::optional key_first_{thrust::nullopt}; thrust::optional key_last_{thrust::nullopt}; + thrust::optional key_offset_{0}; ValueIterator value_first_{}; }; template -class major_properties_t { - public: - major_properties_t() : buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) {} - - major_properties_t(raft::handle_t const& handle, vertex_t buffer_size) - : buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) - { - } - - void fill(T value, rmm::cuda_stream_view stream) - { - thrust::fill(rmm::exec_policy(stream), - value_data(), - value_data() + size_dataframe_buffer(buffer_), - value); - } - - auto value_data() { return get_dataframe_buffer_begin(buffer_); } - - auto device_view() const - { - auto value_first = get_dataframe_buffer_cbegin(buffer_); - return major_properties_device_view_t(value_first); - } - - auto mutable_device_view() - { - auto value_first = get_dataframe_buffer_begin(buffer_); - return major_properties_device_view_t(value_first); - } - - private: - 
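// With keys engaged, get_iter() above resolves a dense vertex offset to its
// compressed value slot by binary-searching the sorted key array (shifted by
// key_offset_ once views are advanced per partition). Host-side analog of that
// lookup (illustrative):
//
//   #include <algorithm>
//   #include <cassert>
//   #include <cstddef>
//   #include <cstdint>
//   #include <vector>
//
//   std::size_t value_slot(std::vector<std::int32_t> const& sorted_keys, std::int32_t offset)
//   {
//     auto it = std::lower_bound(sorted_keys.begin(), sorted_keys.end(), offset);
//     assert((it != sorted_keys.end()) && (*it == offset));  // the key must be present
//     return static_cast<std::size_t>(it - sorted_keys.begin());
//   }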
decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; -}; - -template -class minor_properties_t { +class row_col_properties_t { public: - minor_properties_t() + row_col_properties_t() : key_first_(std::nullopt), key_last_(std::nullopt), buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) { } - minor_properties_t(raft::handle_t const& handle, vertex_t buffer_size) + row_col_properties_t(raft::handle_t const& handle, vertex_t buffer_size) : key_first_(std::nullopt), key_last_(std::nullopt), buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) { } - minor_properties_t(raft::handle_t const& handle, - vertex_t const* key_first, - vertex_t const* key_last) + row_col_properties_t(raft::handle_t const& handle, + vertex_t const* key_first, + vertex_t const* key_last) : key_first_(key_first), key_last_(key_last), buffer_( @@ -165,16 +121,18 @@ class minor_properties_t { value); } + auto key_first() { return key_first_; } + auto key_last() { return key_last_; } auto value_data() { return get_dataframe_buffer_begin(buffer_); } auto device_view() const { auto value_first = get_dataframe_buffer_cbegin(buffer_); if (key_first_) { - return minor_properties_device_view_t( + return row_col_properties_device_view_t( *key_first_, *key_last_, value_first); } else { - return minor_properties_device_view_t(value_first); + return row_col_properties_device_view_t(value_first); } } @@ -182,10 +140,10 @@ class minor_properties_t { { auto value_first = get_dataframe_buffer_begin(buffer_); if (key_first_) { - return minor_properties_device_view_t( + return row_col_properties_device_view_t( *key_first_, *key_last_, value_first); } else { - return minor_properties_device_view_t(value_first); + return row_col_properties_device_view_t(value_first); } } @@ -214,13 +172,8 @@ auto to_thrust_tuple(Iterator iter) } // namespace detail -template -class row_properties_t; - template -class row_properties_t> { +class row_properties_t { public: using value_type = T; @@ -233,43 +186,18 @@ class row_properties_t( + properties_ = detail::row_col_properties_t( handle, *key_first, *key_last); } else { - properties_ = detail::minor_properties_t( + properties_ = detail::row_col_properties_t( handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); } } void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); } - auto value_data() { return properties_.value_data(); } - - auto device_view() const { return properties_.device_view(); } - auto mutable_device_view() { return properties_.mutable_device_view(); } - - private: - detail::minor_properties_t properties_{}; -}; - -template -class row_properties_t> { - public: - using value_type = T; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - row_properties_t() = default; - - row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) - { - properties_ = detail::major_properties_t( - handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); - } - - void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); } + auto key_first() { return properties_.key_first(); } + auto key_last() { return properties_.key_last(); } auto value_data() { return properties_.value_data(); } @@ -277,44 +205,11 @@ class row_properties_t properties_{}; -}; - -template -class col_properties_t; - -template -class col_properties_t> { - public: - using value_type = T; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - col_properties_t() = 
default; - - col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) - { - properties_ = detail::major_properties_t( - handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); - } - - void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); } - - auto value_data() { return properties_.value_data(); } - - auto device_view() const { return properties_.device_view(); } - auto mutable_device_view() { return properties_.mutable_device_view(); } - - private: - detail::major_properties_t properties_{}; + detail::row_col_properties_t properties_{}; }; template -class col_properties_t> { +class col_properties_t { public: using value_type = T; @@ -327,23 +222,26 @@ class col_properties_t( + properties_ = detail::row_col_properties_t( handle, *key_first, *key_last); } else { - properties_ = detail::minor_properties_t( + properties_ = detail::row_col_properties_t( handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); } } void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); } + auto key_first() { return properties_.key_first(); } + auto key_last() { return properties_.key_last(); } + auto value_data() { return properties_.value_data(); } auto device_view() const { return properties_.device_view(); } auto mutable_device_view() { return properties_.mutable_device_view(); } private: - detail::minor_properties_t properties_{}; + detail::row_col_properties_t properties_{}; }; template @@ -365,11 +263,11 @@ class dummy_properties_t { }; template -auto device_view_concat(detail::major_properties_device_view_t... device_views) +auto device_view_concat(detail::row_col_properties_device_view_t... device_views) { auto concat_first = thrust::make_zip_iterator( thrust_tuple_cat(detail::to_thrust_tuple(device_views.value_data())...)); - return detail::major_properties_device_view_t(concat_first); + return detail::row_col_properties_device_view_t(concat_first); } } // namespace cugraph diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 29153fc2d37..b409136ad85 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -260,11 +260,11 @@ class Louvain { current_graph_view_, graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::major_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.begin()), graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::minor_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.begin()), [] __device__(auto, auto, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { @@ -396,11 +396,11 @@ class Louvain { current_graph_view_, graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::major_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.data()), graph_view_t::is_multi_gpu ? 
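// device_view_concat() above fuses several property views into one tuple-valued view
// by zipping their value iterators. The core idea in isolation (Thrust, illustrative):
//
//   #include <cstdint>
//   #include <thrust/iterator/zip_iterator.h>
//   #include <thrust/tuple.h>
//
//   int main()
//   {
//     float weights[3]         = {0.5f, 1.0f, 2.0f};
//     std::int32_t clusters[3] = {0, 0, 1};
//     auto zipped = thrust::make_zip_iterator(thrust::make_tuple(weights, clusters));
//     auto wc     = *(zipped + 2);  // tuple holding (2.0f, 1) for vertex 2
//     (void)wc;
//     return 0;
//   }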
dst_clusters_cache_.device_view() - : detail::minor_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.data()), [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { weight_t sum{0}; @@ -491,13 +491,13 @@ class Louvain { src_cluster_weights.device_view(), src_old_cluster_sum_subtract_pairs.device_view()) : device_view_concat( - detail::major_properties_device_view_t( + detail::row_col_properties_device_view_t( vertex_weights_v_.data()), - detail::major_properties_device_view_t( + detail::row_col_properties_device_view_t( next_clusters_v_.data()), - detail::major_properties_device_view_t( + detail::row_col_properties_device_view_t( vertex_cluster_weights_v.data()), - detail::major_properties_device_view_t( cluster_old_sum_subtract_pair_first)); @@ -507,7 +507,7 @@ class Louvain { zipped_src_device_view, graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::minor_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.data()), cluster_keys_v_.begin(), cluster_keys_v_.end(), @@ -540,7 +540,7 @@ class Louvain { dummy_properties_t{}.device_view(), graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::major_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.data()), detail::return_edge_weight_t{}, weight_t{0}); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 0440c6362c7..a3f6152fa95 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -545,7 +545,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, [col_components = GraphViewType::is_multi_gpu ? adj_matrix_col_components.mutable_device_view() - : detail::minor_properties_device_view_t(level_components), + : detail::row_col_properties_device_view_t(level_components), col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), edge_buffer_first = get_dataframe_buffer_begin>(edge_buffer), diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 716c9f67993..af8b32ef708 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -498,7 +498,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view()), labels, - detail::minor_properties_device_view_t(labels), + detail::row_col_properties_device_view_t(labels), graph_view.get_local_adj_matrix_partition_segment_offsets(0)); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index ba91d485d65..3a95fdb8fbc 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -167,7 +167,7 @@ void sssp(raft::handle_t const& handle, std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, GraphViewType::is_multi_gpu ? 
adj_matrix_row_distances.device_view() - : detail::major_properties_device_view_t(distances), + : detail::row_col_properties_device_view_t(distances), dummy_properties_t{}.device_view(), [vertex_partition, distances, cutoff] __device__( vertex_t src, vertex_t dst, weight_t w, auto src_val, auto) { From 322490da607a223396ba41a16ab9d282a505707b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 14 Sep 2021 12:45:23 -0400 Subject: [PATCH 37/57] update device_bcast to call bcast taking separate input and output iterators --- cpp/include/cugraph/utilities/device_comm.cuh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/include/cugraph/utilities/device_comm.cuh b/cpp/include/cugraph/utilities/device_comm.cuh index d7a9b9c4983..9500f114c5a 100644 --- a/cpp/include/cugraph/utilities/device_comm.cuh +++ b/cpp/include/cugraph/utilities/device_comm.cuh @@ -376,11 +376,8 @@ device_bcast_impl(raft::comms::comms_t const& comm, { static_assert(std::is_same::value_type, typename std::iterator_traits::value_type>::value); - if (comm.get_rank() == root) { - comm.bcast(iter_to_raw_ptr(input_first), count, root, stream_view.value()); - } else { - comm.bcast(iter_to_raw_ptr(output_first), count, root, stream_view.value()); - } + comm.bcast( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, root, stream_view.value()); } template From 1022012b29ec7efe49d3bf37a105adaa091a6f17 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 14 Sep 2021 13:25:57 -0400 Subject: [PATCH 38/57] compute unique edge row/col offsets in graph constructor --- cpp/include/cugraph/graph.hpp | 4 ++ cpp/include/cugraph/graph_view.hpp | 22 ++++++++-- cpp/src/structure/graph_impl.cuh | 62 +++++++++++++++++++++++++-- cpp/src/structure/graph_view_impl.cuh | 8 +++- 4 files changed, 87 insertions(+), 9 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 22629c63458..f03cc81e9cd 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -146,6 +146,7 @@ class graph_t{(*local_sorted_unique_edge_rows_).data() + (*local_sorted_unique_edge_rows_).size()} : std::nullopt, + local_sorted_unique_edge_row_offsets_, local_sorted_unique_edge_cols_ ? 
std::optional{(*local_sorted_unique_edge_cols_).data()} : std::nullopt, @@ -153,6 +154,7 @@ class graph_t{(*local_sorted_unique_edge_cols_).data() + (*local_sorted_unique_edge_cols_).size()} : std::nullopt, + local_sorted_unique_edge_col_offsets_, }, false); } @@ -177,6 +179,8 @@ class graph_t> local_sorted_unique_edge_rows_{std::nullopt}; std::optional> local_sorted_unique_edge_cols_{std::nullopt}; + std::optional> local_sorted_unique_edge_row_offsets_{std::nullopt}; + std::optional> local_sorted_unique_edge_col_offsets_{std::nullopt}; }; // single-GPU version diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 804b765ae4b..2475cb71995 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -306,10 +306,12 @@ struct graph_view_meta_t> adj_matrix_partition_segment_offsets{}; - std::optional local_sorted_unique_edge_row_first_{}; - std::optional local_sorted_unique_edge_row_last_{}; - std::optional local_sorted_unique_edge_col_first_{}; - std::optional local_sorted_unique_edge_col_last_{}; + std::optional local_sorted_unique_edge_row_first{std::nullopt}; + std::optional local_sorted_unique_edge_row_last{std::nullopt}; + std::optional> local_sorted_unique_edge_row_offsets{std::nullopt}; + std::optional local_sorted_unique_edge_col_first{std::nullopt}; + std::optional local_sorted_unique_edge_col_last{std::nullopt}; + std::optional> local_sorted_unique_edge_col_offsets{std::nullopt}; }; // single-GPU version @@ -597,6 +599,11 @@ class graph_view_t> get_local_sorted_unique_edge_row_offsets() const + { + return local_sorted_unique_edge_row_offsets_; + } + std::optional get_local_sorted_unique_edge_col_begin() const { return local_sorted_unique_edge_col_first_; @@ -607,6 +614,11 @@ class graph_view_t> get_local_sorted_unique_edge_col_offsets() const + { + return local_sorted_unique_edge_col_offsets_; + } + private: std::vector adj_matrix_partition_offsets_{}; std::vector adj_matrix_partition_indices_{}; @@ -627,8 +639,10 @@ class graph_view_t local_sorted_unique_edge_row_first_{std::nullopt}; std::optional local_sorted_unique_edge_row_last_{std::nullopt}; + std::optional> local_sorted_unique_edge_row_offsets_{std::nullopt}; std::optional local_sorted_unique_edge_col_first_{std::nullopt}; std::optional local_sorted_unique_edge_col_last_{std::nullopt}; + std::optional> local_sorted_unique_edge_col_offsets_{std::nullopt}; }; // single-GPU version diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 3f6cc1a60fd..e1371586dd8 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -435,10 +435,37 @@ graph_t{adj_matrix_partition_offsets_[i].data(), major_first})); } assert(cur_size == num_local_unique_edge_majors); + + std::vector h_vertex_partition_firsts(col_comm_size - 1); + for (int i = 1; i < col_comm_size; ++i) { + h_vertex_partition_firsts[i - 1] = + partition_.get_vertex_partition_first(i * row_comm_size + row_comm_rank); + } + rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_firsts.data(), + h_vertex_partition_firsts.data(), + h_vertex_partition_firsts.size(), + handle.get_stream()); + rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), + handle.get_stream()); + thrust::lower_bound(handle.get_thrust_policy(), + local_sorted_unique_edge_majors.begin(), + local_sorted_unique_edge_majors.end(), + d_vertex_partition_firsts.begin(), + 
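// The per-partition key offsets computed here are lower_bound positions of each
// vertex partition's first vertex id within the sorted unique-edge list, padded
// with 0 at the front and the list size at the back. Host-side analog (illustrative):
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <cstdint>
//   #include <vector>
//
//   std::vector<std::size_t> compute_key_offsets(
//     std::vector<std::int32_t> const& sorted_keys,
//     std::vector<std::int32_t> const& partition_firsts)  // firsts of partitions 1..P-1
//   {
//     std::vector<std::size_t> offsets{0};
//     for (auto v : partition_firsts) {
//       offsets.push_back(static_cast<std::size_t>(
//         std::lower_bound(sorted_keys.begin(), sorted_keys.end(), v) - sorted_keys.begin()));
//     }
//     offsets.push_back(sorted_keys.size());
//     return offsets;
//   }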
d_vertex_partition_firsts.end(), + d_key_offsets.begin()); + std::vector h_key_offsets(col_comm_size + 1, vertex_t{0}); + h_key_offsets.back() = static_cast(local_sorted_unique_edge_majors.size()); + raft::update_host( + h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + if constexpr (store_transposed) { - local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_majors); + local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_majors); + local_sorted_unique_edge_col_offsets_ = std::move(h_key_offsets); } else { - local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_majors); + local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_majors); + local_sorted_unique_edge_row_offsets_ = std::move(h_key_offsets); } } @@ -479,10 +506,37 @@ graph_t h_vertex_partition_firsts(row_comm_size - 1); + for (int i = 1; i < row_comm_size; ++i) { + h_vertex_partition_firsts[i - 1] = + partition_.get_vertex_partition_first(col_comm_rank * row_comm_size + i); + } + rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_firsts.data(), + h_vertex_partition_firsts.data(), + h_vertex_partition_firsts.size(), + handle.get_stream()); + rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), + handle.get_stream()); + thrust::lower_bound(handle.get_thrust_policy(), + local_sorted_unique_edge_minors.begin(), + local_sorted_unique_edge_minors.end(), + d_vertex_partition_firsts.begin(), + d_vertex_partition_firsts.end(), + d_key_offsets.begin()); + std::vector h_key_offsets(row_comm_size + 1, vertex_t{0}); + h_key_offsets.back() = static_cast(local_sorted_unique_edge_minors.size()); + raft::update_host( + h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + if constexpr (store_transposed) { - local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_minors); + local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_minors); + local_sorted_unique_edge_row_offsets_ = std::move(h_key_offsets); } else { - local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_minors); + local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_minors); + local_sorted_unique_edge_col_offsets_ = std::move(h_key_offsets); } } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 368573d4a91..027697630e6 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -179,7 +179,13 @@ graph_view_t Date: Tue, 14 Sep 2021 15:57:22 -0400 Subject: [PATCH 39/57] update copy_to_adj_matrix_row/col to handle (key, value) pairs --- .../prims/copy_to_adj_matrix_row_col.cuh | 233 +++++++++++++----- 1 file changed, 168 insertions(+), 65 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index ab27e7cc3c7..b03e653db27 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -30,11 +30,13 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -51,7 +53,9 @@ void copy_to_matrix_major(raft::handle_t const& handle, VertexValueInputIterator vertex_value_input_first, MatrixMajorValueOutputWrapper& matrix_major_value_output) { - if (GraphViewType::is_multi_gpu) { + if constexpr 
(GraphViewType::is_multi_gpu) { + using vertex_t = typename GraphViewType::vertex_type; + auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -72,18 +76,53 @@ void copy_to_matrix_major(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif - std::vector rx_counts(col_comm_size, size_t{0}); - std::vector displacements(col_comm_size, size_t{0}); - for (int i = 0; i < col_comm_size; ++i) { - rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); - displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; + if (matrix_major_value_output.key_first()) { + auto key_offsets = GraphViewType::is_adj_matrix_transposed + ? *(graph_view.get_local_sorted_unique_edge_col_offsets()) + : *(graph_view.get_local_sorted_unique_edge_row_offsets()); + + vertex_t max_rx_size{0}; + for (int i = 0; i < col_comm_size; ++i) { + max_rx_size = std::max( + max_rx_size, graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank)); + } + auto rx_value_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(max_rx_size, + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_value_buffer); + for (int i = 0; i < col_comm_size; ++i) { + device_bcast(col_comm, + vertex_value_input_first, + rx_value_first, + graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank), + i, + handle.get_stream()); + + auto v_offset_first = thrust::make_transform_iterator( + *(matrix_major_value_output.key_first()) + key_offsets[i], + [v_first = graph_view.get_vertex_partition_first( + i * row_comm_size + row_comm_rank)] __device__(auto v) { return v - v_first; }); + thrust::gather(handle.get_thrust_policy(), + v_offset_first, + v_offset_first + (key_offsets[i + 1] - key_offsets[i]), + rx_value_first, + matrix_major_value_output.value_data() + key_offsets[i]); + } + } else { + std::vector rx_counts(col_comm_size, size_t{0}); + std::vector displacements(col_comm_size, size_t{0}); + for (int i = 0; i < col_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; + } + device_allgatherv(col_comm, + vertex_value_input_first, + matrix_major_value_output.value_data(), + rx_counts, + displacements, + handle.get_stream()); } - device_allgatherv(col_comm, - vertex_value_input_first, - matrix_major_value_output.value_data(), - rx_counts, - displacements, - handle.get_stream()); // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between // two different communicators (end of col_comm) @@ -96,6 +135,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif } else { + assert(!(matrix_major_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? 
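// In the keyed branch above, values broadcast as a dense per-partition array are
// gathered straight into the compressed slots: a transform iterator maps each
// locally present key to its offset in the receive buffer. The Thrust core of that
// gather (fragment; keys, v_first, num_keys, rx_values, and out_values are assumed
// to be in scope, and the __device__ lambda needs --extended-lambda):
//
//   #include <thrust/execution_policy.h>
//   #include <thrust/gather.h>
//   #include <thrust/iterator/transform_iterator.h>
//
//   auto map_first = thrust::make_transform_iterator(
//     keys, [v_first] __device__(int32_t v) { return v - v_first; });
//   thrust::gather(thrust::device,
//                  map_first, map_first + num_keys,
//                  rx_values,    // dense values for this vertex partition
//                  out_values);  // compressed (key, value) slots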
graph_view.get_number_of_local_adj_matrix_partition_cols() : graph_view.get_number_of_local_adj_matrix_partition_rows()); @@ -121,7 +161,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - if (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -146,20 +186,26 @@ void copy_to_matrix_major(raft::handle_t const& handle, host_scalar_allgather(col_comm, static_cast(thrust::distance(vertex_first, vertex_last)), handle.get_stream()); + auto max_rx_size = + std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + return std::max(lhs, rhs); + }); + rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(max_rx_size, + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + + auto key_offsets = GraphViewType::is_adj_matrix_transposed + ? graph_view.get_local_sorted_unique_edge_col_offsets() + : graph_view.get_local_sorted_unique_edge_row_offsets(); for (int i = 0; i < col_comm_size; ++i) { auto matrix_partition = matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); - rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - if (col_comm_rank == i) { auto vertex_partition = vertex_partition_device_view_t( @@ -183,19 +229,24 @@ void copy_to_matrix_major(raft::handle_t const& handle, col_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); device_bcast(col_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - if (col_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter( + if (matrix_major_value_output.key_first()) { + thrust::for_each( handle.get_thrust_policy(), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output.value_data() + matrix_partition.get_major_value_start_offset()); + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator((*key_offsets)[i + 1] - (*key_offsets)[i]), + [rx_vertex_first = rx_vertices.begin(), + rx_vertex_last = rx_vertices.end(), + rx_value_first, + output_key_first = *(matrix_major_value_output.key_first()) + (*key_offsets)[i], + output_value_first = + matrix_major_value_output.value_data() + (*key_offsets)[i]] __device__(auto i) { + auto major = *(output_key_first + i); + auto it = thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, major); + if (*it == major) { + auto rx_value = *(rx_value_first + thrust::distance(rx_vertex_first, it)); + *(output_value_first + i) = rx_value; + } + }); } else { auto map_first = 
thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -223,6 +274,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif } else { + assert(!(matrix_major_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() : graph_view.get_number_of_local_adj_matrix_partition_rows()); @@ -243,7 +295,9 @@ void copy_to_matrix_minor(raft::handle_t const& handle, VertexValueInputIterator vertex_value_input_first, MatrixMinorValueOutputWrapper& matrix_minor_value_output) { - if (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_multi_gpu) { + using vertex_t = typename GraphViewType::vertex_type; + auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -264,18 +318,53 @@ void copy_to_matrix_minor(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif - std::vector rx_counts(row_comm_size, size_t{0}); - std::vector displacements(row_comm_size, size_t{0}); - for (int i = 0; i < row_comm_size; ++i) { - rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; + if (matrix_minor_value_output.key_first()) { + auto key_offsets = GraphViewType::is_adj_matrix_transposed + ? *(graph_view.get_local_sorted_unique_edge_row_offsets()) + : *(graph_view.get_local_sorted_unique_edge_col_offsets()); + + vertex_t max_rx_size{0}; + for (int i = 0; i < row_comm_size; ++i) { + max_rx_size = std::max( + max_rx_size, graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)); + } + auto rx_value_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(max_rx_size, + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_value_buffer); + for (int i = 0; i < row_comm_size; ++i) { + device_bcast(row_comm, + vertex_value_input_first, + rx_value_first, + graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i), + i, + handle.get_stream()); + + auto v_offset_first = thrust::make_transform_iterator( + *(matrix_minor_value_output.key_first()) + key_offsets[i], + [v_first = graph_view.get_vertex_partition_first( + col_comm_rank * row_comm_size + i)] __device__(auto v) { return v - v_first; }); + thrust::gather(handle.get_thrust_policy(), + v_offset_first, + v_offset_first + (key_offsets[i + 1] - key_offsets[i]), + rx_value_first, + matrix_minor_value_output.value_data() + key_offsets[i]); + } + } else { + std::vector rx_counts(row_comm_size, size_t{0}); + std::vector displacements(row_comm_size, size_t{0}); + for (int i = 0; i < row_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + displacements[i] = (i == 0) ? 
0 : displacements[i - 1] + rx_counts[i - 1]; + } + device_allgatherv(row_comm, + vertex_value_input_first, + matrix_minor_value_output.value_data(), + rx_counts, + displacements, + handle.get_stream()); } - device_allgatherv(row_comm, - vertex_value_input_first, - matrix_minor_value_output.value_data(), - rx_counts, - displacements, - handle.get_stream()); // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between // two different communicators (end of row_comm) @@ -288,6 +377,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif } else { + assert(!(matrix_minor_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() : graph_view.get_number_of_local_adj_matrix_partition_cols()); @@ -313,7 +403,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - if (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -338,19 +428,25 @@ void copy_to_matrix_minor(raft::handle_t const& handle, host_scalar_allgather(row_comm, static_cast(thrust::distance(vertex_first, vertex_last)), handle.get_stream()); + auto max_rx_size = + std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + return std::max(lhs, rhs); + }); + rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(max_rx_size, + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + + auto key_offsets = GraphViewType::is_adj_matrix_transposed + ? graph_view.get_local_sorted_unique_edge_row_offsets() + : graph_view.get_local_sorted_unique_edge_col_offsets(); auto matrix_partition = matrix_partition_device_view_t( graph_view.get_matrix_partition_view(size_t{0})); for (int i = 0; i < row_comm_size; ++i) { - rmm::device_uvector rx_vertices(row_comm_rank == i ? 
size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - if (row_comm_rank == i) { auto vertex_partition = vertex_partition_device_view_t( @@ -374,18 +470,24 @@ void copy_to_matrix_minor(raft::handle_t const& handle, row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); device_bcast(row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - if (row_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); + if (matrix_minor_value_output.key_first()) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator((*key_offsets)[i + 1] - (*key_offsets)[i]), + [rx_vertex_first = rx_vertices.begin(), + rx_vertex_last = rx_vertices.end(), + rx_value_first, + output_key_first = *(matrix_minor_value_output.key_first()) + (*key_offsets)[i], + output_value_first = + matrix_minor_value_output.value_data() + (*key_offsets)[i]] __device__(auto i) { + auto minor = *(output_key_first + i); + auto it = thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, minor); + if (*it == minor) { + auto rx_value = *(rx_value_first + thrust::distance(rx_vertex_first, it)); + *(output_value_first + i) = rx_value; + } }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(handle.get_thrust_policy(), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_minor_value_output.value_data()); } else { auto map_first = thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -412,6 +514,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif } else { + assert(!(matrix_minor_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == graph_view.get_number_of_local_adj_matrix_partition_rows()); auto val_first = thrust::make_permutation_iterator(vertex_value_input_first, vertex_first); From 540b9734e4005a8f696e884cd6405867e7976f0f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 14 Sep 2021 21:35:22 -0400 Subject: [PATCH 40/57] update code calling device_bcast --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 12 ---------- .../update_frontier_v_push_if_out_nbr.cuh | 19 ++------------- cpp/src/structure/coarsen_graph_impl.cuh | 8 +------ cpp/src/structure/renumber_edgelist_impl.cuh | 24 ++++++++++++------- 4 files changed, 18 insertions(+), 45 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 7d68cf8d13d..58b5c19ed05 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -425,18 +425,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( i, handle.get_stream()); } - // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input & - // output pointers - auto execution_policy = 
handle.get_thrust_policy(); - thrust::copy(execution_policy, - map_unique_key_first, - map_unique_key_last, - map_keys.begin() + map_displacements[row_comm_rank]); - thrust::copy( - execution_policy, - map_value_first, - map_value_first + thrust::distance(map_unique_key_first, map_unique_key_last), - get_dataframe_buffer_begin(map_value_buffer) + map_displacements[row_comm_rank]); handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh index 77d0aaab2ec..507d54ecac8 100644 --- a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh @@ -654,18 +654,10 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( rmm::device_uvector frontier_vertices(local_frontier_sizes[i], handle.get_stream_view()); - // FIXME: this copy is unnecessary, better fix RAFT comm's bcast to take const iterators for - // input - if (col_comm_rank == static_cast(i)) { - thrust::copy(execution_policy, - local_frontier_vertex_first, - local_frontier_vertex_last, - frontier_vertices.begin()); - } device_bcast(col_comm, + local_frontier_vertex_first, frontier_vertices.data(), - frontier_vertices.data(), - frontier_vertices.size(), + local_frontier_sizes[i], static_cast(i), handle.get_stream()); @@ -893,13 +885,6 @@ void update_frontier_v_push_if_out_nbr( resize_dataframe_buffer( matrix_partition_frontier_key_buffer, matrix_partition_frontier_size, handle.get_stream()); - if (static_cast(col_comm_rank) == i) { - thrust::copy(handle.get_thrust_policy(), - frontier_key_first, - frontier_key_last, - get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer)); - } - device_bcast(col_comm, frontier_key_first, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index af8b32ef708..1fe22f4c902 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -269,14 +269,8 @@ coarsen_graph( store_transposed ? 
graph_view.get_number_of_local_adj_matrix_partition_cols(i) : graph_view.get_number_of_local_adj_matrix_partition_rows(i), handle.get_stream()); - if (col_comm_rank == static_cast(i)) { - // FIXME: this copy is unnecessary, beter fix RAFT comm's bcast to take const iterators for - // input - thrust::copy( - handle.get_thrust_policy(), labels, labels + major_labels.size(), major_labels.begin()); - } device_bcast(col_comm, - major_labels.data(), + labels, major_labels.data(), major_labels.size(), static_cast(i), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 56aca7725ea..75b04bf8b6b 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -701,11 +701,14 @@ renumber_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif + vertex_t max_matrix_partition_major_size{0}; + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + max_matrix_partition_major_size = + std::max(max_matrix_partition_major_size, partition.get_matrix_partition_major_size(i)); + } + rmm::device_uvector renumber_map_major_labels(max_matrix_partition_major_size, + handle.get_stream()); for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - rmm::device_uvector renumber_map_major_labels( - col_comm_rank == static_cast(i) ? vertex_t{0} - : partition.get_matrix_partition_major_size(i), - handle.get_stream()); device_bcast(col_comm, renumber_map_labels.data(), renumber_map_major_labels.data(), @@ -728,8 +731,7 @@ renumber_edgelist( invalid_vertex_id::value, stream_adapter}; auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( - col_comm_rank == static_cast(i) ? renumber_map_labels.begin() - : renumber_map_major_labels.begin(), + renumber_map_major_labels.begin(), thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))); renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); renumber_map.find(edgelist_major_vertices[i], @@ -750,10 +752,14 @@ renumber_edgelist( if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part + vertex_t max_segment_size{0}; + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + max_segment_size = std::max( + max_segment_size, partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i)); + } + rmm::device_uvector renumber_map_minor_labels(max_segment_size, handle.get_stream()); for (int i = 0; i < row_comm_size; ++i) { auto segment_size = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - rmm::device_uvector renumber_map_minor_labels( - row_comm_rank == i ? vertex_t{0} : segment_size, handle.get_stream()); device_bcast(row_comm, renumber_map_labels.data(), renumber_map_minor_labels.data(), @@ -776,7 +782,7 @@ renumber_edgelist( invalid_vertex_id::value, stream_adapter}; auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( - row_comm_rank == i ? 
renumber_map_labels.begin() : renumber_map_minor_labels.begin(), + renumber_map_minor_labels.begin(), thrust::make_counting_iterator( partition.get_vertex_partition_first(col_comm_rank * row_comm_size + i)))); renumber_map.insert(pair_first, pair_first + segment_size); From 947ef7beaf4f9c65b89e8bf7568b7dd196980702 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 15 Sep 2021 10:01:58 -0400 Subject: [PATCH 41/57] bug fix --- .../structure/create_graph_from_edgelist_impl.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index 69e16266bcd..e9f670ef4a4 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -131,11 +131,14 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, cugraph::graph_t( handle, edgelists, - cugraph::graph_meta_t{meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.segment_offsets}), + cugraph::graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.segment_offsets, + store_transposed ? meta.num_local_unique_edge_minors : meta.num_local_unique_edge_majors, + store_transposed ? meta.num_local_unique_edge_majors : meta.num_local_unique_edge_minors}), std::optional>{std::move(renumber_map_labels)}); } From e2d5e5833a31dd2eb4ef40ea1e11099959c0dfd4 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 15 Sep 2021 14:11:47 -0400 Subject: [PATCH 42/57] bug fix --- cpp/src/structure/coarsen_graph_impl.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 1fe22f4c902..3d9e99a6fea 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -460,7 +460,9 @@ coarsen_graph( meta.number_of_edges, graph_properties_t{graph_view.is_symmetric(), false}, meta.partition, - meta.segment_offsets}), + meta.segment_offsets, + store_transposed ? meta.num_local_unique_edge_minors : meta.num_local_unique_edge_majors, + store_transposed ? 
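meta.num_local_unique_edge_majors : meta.num_local_unique_edge_minors}),
    std::move(renumber_map_labels));
}

From 593a3102e7f9900ae58d315ef7809dea491f343d Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 15 Sep 2021 14:36:16 -0400
Subject: [PATCH 43/57] update host_scalar_(all)reduce to take reduction op

host_scalar_allreduce & host_scalar_reduce previously hard-coded
raft::comms::op_t::SUM; the reduction op is now an explicit parameter, and
every call site passes it. A minimal calling sketch under the updated
signature (local_count and local_ratio are illustrative placeholders, not
names from this patch; assumes a raft::handle_t with comms initialized):

    // sum a per-rank count across all GPUs (the old implicit behavior)
    auto global_count = cugraph::host_scalar_allreduce(
      handle.get_comms(), local_count, raft::comms::op_t::SUM, handle.get_stream());

    // other reductions become possible, e.g. a global maximum
    auto global_max = cugraph::host_scalar_allreduce(
      handle.get_comms(), local_ratio, raft::comms::op_t::MAX, handle.get_stream());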
---
 cpp/include/cugraph/prims/count_if_v.cuh      |  4 +--
 cpp/include/cugraph/prims/reduce_v.cuh        |  6 ++--
 .../cugraph/prims/transform_reduce_e.cuh      |  3 +-
 .../cugraph/prims/transform_reduce_v.cuh      |  6 ++--
 cpp/include/cugraph/prims/vertex_frontier.cuh |  6 ++--
 .../cugraph/utilities/host_scalar_comm.cuh    | 28 +++++++++++--------
 cpp/src/community/louvain.cuh                 |  4 +--
 .../weakly_connected_components_impl.cuh      |  2 +-
 cpp/src/link_analysis/pagerank_impl.cuh       | 13 +++++----
 cpp/src/structure/graph_impl.cuh              |  4 +--
 cpp/src/structure/graph_view_impl.cuh         |  6 ++--
 cpp/src/structure/renumber_edgelist_impl.cuh  |  1 +
 cpp/src/traversal/bfs_impl.cuh                |  3 +-
 cpp/src/utilities/cython.cu                   |  6 ++--
 14 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/cpp/include/cugraph/prims/count_if_v.cuh b/cpp/include/cugraph/prims/count_if_v.cuh
index b2d4283d859..05f778c9a0c 100644
--- a/cpp/include/cugraph/prims/count_if_v.cuh
+++ b/cpp/include/cugraph/prims/count_if_v.cuh
@@ -59,7 +59,7 @@ typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle,
                               vertex_value_input_first + graph_view.get_number_of_local_vertices(),
                               v_op);
   if (GraphViewType::is_multi_gpu) {
-    count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream());
+    count = host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream());
   }
   return count;
 }
@@ -94,7 +94,7 @@ typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle,
 {
   auto count = thrust::count_if(handle.get_thrust_policy(), input_first, input_last, v_op);
   if (GraphViewType::is_multi_gpu) {
-    count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream());
+    count = host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream());
   }
   return count;
 }
diff --git a/cpp/include/cugraph/prims/reduce_v.cuh b/cpp/include/cugraph/prims/reduce_v.cuh
index f41774675fb..998d9cdb917 100644
--- a/cpp/include/cugraph/prims/reduce_v.cuh
+++ b/cpp/include/cugraph/prims/reduce_v.cuh
@@ -58,7 +58,8 @@ T reduce_v(raft::handle_t const& handle,
     ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ? init : T{},
     property_add());
   if (GraphViewType::is_multi_gpu) {
-    ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream());
+    ret =
+      host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream());
   }
   return ret;
 }
@@ -95,7 +96,8 @@ T reduce_v(raft::handle_t const& handle,
     ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ?
init : T{}, property_add()); if (GraphViewType::is_multi_gpu) { - ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + ret = + host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream()); } return ret; } diff --git a/cpp/include/cugraph/prims/transform_reduce_e.cuh b/cpp/include/cugraph/prims/transform_reduce_e.cuh index f15880b4ec0..990730dee32 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e.cuh @@ -517,7 +517,8 @@ T transform_reduce_e(raft::handle_t const& handle, edge_property_add); if (GraphViewType::is_multi_gpu) { - result = host_scalar_allreduce(handle.get_comms(), result, handle.get_stream()); + result = host_scalar_allreduce( + handle.get_comms(), result, raft::comms::op_t::SUM, handle.get_stream()); } return result; diff --git a/cpp/include/cugraph/prims/transform_reduce_v.cuh b/cpp/include/cugraph/prims/transform_reduce_v.cuh index 118db15b38a..812283da838 100644 --- a/cpp/include/cugraph/prims/transform_reduce_v.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_v.cuh @@ -63,7 +63,8 @@ T transform_reduce_v(raft::handle_t const& handle, ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ? init : T{}, property_add()); if (GraphViewType::is_multi_gpu) { - ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + ret = + host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream()); } return ret; } @@ -106,7 +107,8 @@ T transform_reduce_v(raft::handle_t const& handle, ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ? init : T{}, property_add()); if (GraphViewType::is_multi_gpu) { - ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + ret = + host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream()); } return ret; } diff --git a/cpp/include/cugraph/prims/vertex_frontier.cuh b/cpp/include/cugraph/prims/vertex_frontier.cuh index 5f5a3225bdc..82e0f4ab880 100644 --- a/cpp/include/cugraph/prims/vertex_frontier.cuh +++ b/cpp/include/cugraph/prims/vertex_frontier.cuh @@ -194,8 +194,10 @@ class SortedUniqueKeyBucket { template std::enable_if_t aggregate_size() const { - return host_scalar_allreduce( - handle_ptr_->get_comms(), vertices_.size(), handle_ptr_->get_stream()); + return host_scalar_allreduce(handle_ptr_->get_comms(), + vertices_.size(), + raft::comms::op_t::SUM, + handle_ptr_->get_stream()); } template diff --git a/cpp/include/cugraph/utilities/host_scalar_comm.cuh b/cpp/include/cugraph/utilities/host_scalar_comm.cuh index 5675d57e8d3..bc056548dac 100644 --- a/cpp/include/cugraph/utilities/host_scalar_comm.cuh +++ b/cpp/include/cugraph/utilities/host_scalar_comm.cuh @@ -68,14 +68,15 @@ template struct host_allreduce_tuple_scalar_element_impl { void run(raft::comms::comms_t const& comm, rmm::device_uvector& tuple_scalar_elements, + raft::comms::op_t op, cudaStream_t stream) const { using element_t = typename thrust::tuple_element::type; static_assert(sizeof(element_t) <= sizeof(int64_t)); auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); - comm.allreduce(ptr, ptr, 1, raft::comms::op_t::SUM, stream); + comm.allreduce(ptr, ptr, 1, op, stream); host_allreduce_tuple_scalar_element_impl().run( - comm, tuple_scalar_elements, stream); + comm, tuple_scalar_elements, op, stream); } }; @@ -83,6 +84,7 @@ template struct host_allreduce_tuple_scalar_element_impl { void run(raft::comms::comms_t const& comm, 
rmm::device_uvector& tuple_scalar_elements, + raft::comms::op_t op, cudaStream_t stream) const { } @@ -92,15 +94,16 @@ template struct host_reduce_tuple_scalar_element_impl { void run(raft::comms::comms_t const& comm, rmm::device_uvector& tuple_scalar_elements, + raft::comms::op_t op, int root, cudaStream_t stream) const { using element_t = typename thrust::tuple_element::type; static_assert(sizeof(element_t) <= sizeof(int64_t)); auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); - comm.reduce(ptr, ptr, 1, raft::comms::op_t::SUM, root, stream); + comm.reduce(ptr, ptr, 1, op, root, stream); host_reduce_tuple_scalar_element_impl().run( - comm, tuple_scalar_elements, root, stream); + comm, tuple_scalar_elements, op, root, stream); } }; @@ -108,6 +111,7 @@ template struct host_reduce_tuple_scalar_element_impl { void run(raft::comms::comms_t const& comm, rmm::device_uvector& tuple_scalar_elements, + raft::comms::op_t op, int root, cudaStream_t stream) const { @@ -118,11 +122,11 @@ struct host_reduce_tuple_scalar_element_impl { template std::enable_if_t::value, T> host_scalar_allreduce( - raft::comms::comms_t const& comm, T input, cudaStream_t stream) + raft::comms::comms_t const& comm, T input, raft::comms::op_t op, cudaStream_t stream) { rmm::device_uvector d_input(1, stream); raft::update_device(d_input.data(), &input, 1, stream); - comm.allreduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream); + comm.allreduce(d_input.data(), d_input.data(), 1, op, stream); T h_input{}; raft::update_host(&h_input, d_input.data(), 1, stream); auto status = comm.sync_stream(stream); @@ -132,7 +136,7 @@ std::enable_if_t::value, T> host_scalar_allreduce( template std::enable_if_t::value, T> host_scalar_allreduce( - raft::comms::comms_t const& comm, T input, cudaStream_t stream) + raft::comms::comms_t const& comm, T input, raft::comms::op_t op, cudaStream_t stream) { size_t constexpr tuple_size = thrust::tuple_size::value; std::vector h_tuple_scalar_elements(tuple_size); @@ -144,7 +148,7 @@ std::enable_if_t::value, T> host_scala raft::update_device( d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); detail::host_allreduce_tuple_scalar_element_impl().run( - comm, d_tuple_scalar_elements, stream); + comm, d_tuple_scalar_elements, op, stream); raft::update_host( h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); auto status = comm.sync_stream(stream); @@ -158,11 +162,11 @@ std::enable_if_t::value, T> host_scala // Return value is valid only in root (return value may better be std::optional in C++17 or later) template std::enable_if_t::value, T> host_scalar_reduce( - raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) + raft::comms::comms_t const& comm, T input, raft::comms::op_t op, int root, cudaStream_t stream) { rmm::device_uvector d_input(1, stream); raft::update_device(d_input.data(), &input, 1, stream); - comm.reduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream); + comm.reduce(d_input.data(), d_input.data(), 1, op, stream); T h_input{}; if (comm.get_rank() == root) { raft::update_host(&h_input, d_input.data(), 1, stream); } auto status = comm.sync_stream(stream); @@ -173,7 +177,7 @@ std::enable_if_t::value, T> host_scalar_reduce( // Return value is valid only in root (return value may better be std::optional in C++17 or later) template std::enable_if_t::value, T> host_scalar_reduce( - raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) + 
raft::comms::comms_t const& comm, T input, raft::comms::op_t op, int root, cudaStream_t stream) { size_t constexpr tuple_size = thrust::tuple_size::value; std::vector h_tuple_scalar_elements(tuple_size); @@ -185,7 +189,7 @@ std::enable_if_t::value, T> host_scala raft::update_device( d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); detail::host_reduce_tuple_scalar_element_impl().run( - comm, d_tuple_scalar_elements, root, stream); + comm, d_tuple_scalar_elements, op, root, stream); if (comm.get_rank() == root) { raft::update_host( h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 613f8da3206..23adca97342 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -251,8 +251,8 @@ class Louvain { thrust::plus()); if (graph_view_t::is_multi_gpu) { - sum_degree_squared = - host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, handle_.get_stream()); + sum_degree_squared = host_scalar_allreduce( + handle_.get_comms(), sum_degree_squared, raft::comms::op_t::SUM, handle_.get_stream()); } weight_t sum_internal = transform_reduce_e( diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index c7f214c6314..3fb9505fa54 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -656,7 +656,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto aggregate_num_inserts = num_inserts; if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); - aggregate_num_inserts = host_scalar_allreduce(comm, num_inserts, handle.get_stream()); + aggregate_num_inserts = host_scalar_allreduce(comm, num_inserts, raft::comms::op_t::SUM, handle.get_stream()); } if (aggregate_num_inserts > 0) { diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index c3ccf4f4763..b6023d21bf2 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -69,12 +69,13 @@ void pagerank( if (num_vertices == 0) { return; } auto aggregate_personalization_vector_size = - personalization_vertices - ? GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), *personalization_vector_size, handle.get_stream()) - : *personalization_vector_size - : vertex_t{0}; + personalization_vertices ? GraphViewType::is_multi_gpu + ? host_scalar_allreduce(handle.get_comms(), + *personalization_vector_size, + raft::comms::op_t::SUM, + handle.get_stream()) + : *personalization_vector_size + : vertex_t{0}; // 1. 
check input arguments diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index e54d69204ac..23bf80adca0 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -268,8 +268,8 @@ graph_tget_number_of_edges(), "Invalid input argument: the sum of local edge counts does not match with " "meta.number_of_edges."); diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 368573d4a91..572fa0598fc 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -267,8 +267,10 @@ graph_view_t{minor_first, minor_last}) == 0, "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs."); } - number_of_local_edges_sum = host_scalar_allreduce( - this->get_handle_ptr()->get_comms(), number_of_local_edges_sum, default_stream_view.value()); + number_of_local_edges_sum = host_scalar_allreduce(this->get_handle_ptr()->get_comms(), + number_of_local_edges_sum, + raft::comms::op_t::SUM, + default_stream_view.value()); CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), "Internal Error: the sum of local edges counts does not match with " "number_of_local_edges."); diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 8104db8eebc..b6a0b3c98b2 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -669,6 +669,7 @@ renumber_edgelist( auto number_of_edges = host_scalar_allreduce( comm, std::accumulate(edgelist_edge_counts.begin(), edgelist_edge_counts.end(), edge_t{0}), + raft::comms::op_t::SUM, handle.get_stream()); // 3. renumber edges diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index ed528bd3d34..c1b8260b0a3 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -71,7 +71,8 @@ void bfs(raft::handle_t const& handle, auto aggregate_n_sources = GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), n_sources, handle.get_stream()) + ? 
host_scalar_allreduce( + handle.get_comms(), n_sources, raft::comms::op_t::SUM, handle.get_stream()) : n_sources; CUGRAPH_EXPECTS(aggregate_n_sources > 0, "Invalid input argument: input should have at least one source"); diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index bee78fb9952..2fe4bd3fe60 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -580,9 +580,9 @@ void call_pagerank(raft::handle_t const& handle, bool has_guess) { if (graph_container.is_multi_gpu) { - auto& comm = handle.get_comms(); - auto aggregate_personalization_subset_size = - cugraph::host_scalar_allreduce(comm, personalization_subset_size, handle.get_stream()); + auto& comm = handle.get_comms(); + auto aggregate_personalization_subset_size = cugraph::host_scalar_allreduce( + comm, personalization_subset_size, raft::comms::op_t::SUM, handle.get_stream()); if (graph_container.edgeType == numberTypeEnum::int32Type) { auto graph = From 11a3053630f39c83878e168057e34bd6507b8fea Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 15 Sep 2021 14:40:54 -0400 Subject: [PATCH 44/57] clang-format --- cpp/include/cugraph/prims/count_if_v.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/prims/count_if_v.cuh b/cpp/include/cugraph/prims/count_if_v.cuh index 05f778c9a0c..f56da008eca 100644 --- a/cpp/include/cugraph/prims/count_if_v.cuh +++ b/cpp/include/cugraph/prims/count_if_v.cuh @@ -59,7 +59,8 @@ typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle, vertex_value_input_first + graph_view.get_number_of_local_vertices(), v_op); if (GraphViewType::is_multi_gpu) { - count = host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream()); + count = + host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream()); } return count; } @@ -94,7 +95,8 @@ typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle, { auto count = thrust::count_if(handle.get_thrust_policy(), input_first, input_last, v_op); if (GraphViewType::is_multi_gpu) { - count = host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream()); + count = + host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream()); } return count; } From 3c5bb2d593b86db5c5a35c3620067e5c75d8acc2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 15 Sep 2021 14:45:18 -0400 Subject: [PATCH 45/57] additional clang-format --- cpp/src/components/weakly_connected_components_impl.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 3fb9505fa54..66c9447605d 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -655,8 +655,9 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto num_inserts = num_edge_inserts.value(handle.get_stream_view()); auto aggregate_num_inserts = num_inserts; if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - aggregate_num_inserts = host_scalar_allreduce(comm, num_inserts, raft::comms::op_t::SUM, handle.get_stream()); + auto& comm = handle.get_comms(); + aggregate_num_inserts = + host_scalar_allreduce(comm, num_inserts, raft::comms::op_t::SUM, handle.get_stream()); } if (aggregate_num_inserts > 0) { From a85181822dc627f3ebadbcb8ea2416382f2bbf9f Mon Sep 
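17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 15 Sep 2021 14:52:48 -0400
Subject: [PATCH 46/57] bug fix (max reduction instead of erroneous sum reduction)

The fill-ratio check decides whether row/col properties are stored as
(key, value) pairs; it is meant to compare the largest per-GPU fill ratio
against detail::row_col_properties_kv_pair_fill_ratio_threshold. Summing the
per-rank ratios inflates the value with the number of ranks, so the reduction
op is switched to MAX. A minimal sketch of the fixed pattern (fill_ratio is
an illustrative local value, not a name from this patch):

    // every rank contributes its own ratio; keep the global maximum, not the sum
    auto max_fill_ratio = cugraph::host_scalar_allreduce(
      comm, fill_ratio, raft::comms::op_t::MAX, handle.get_stream());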
---
 cpp/src/structure/graph_impl.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index e1371586dd8..68c517dc599 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -413,10 +413,12 @@ graph_t
   auto max_major_properties_fill_ratio = host_scalar_allreduce(
     comm,
     static_cast(num_local_unique_edge_majors) / static_cast(aggregate_major_size),
+    raft::comms::op_t::MAX,
     handle.get_stream());
   auto max_minor_properties_fill_ratio = host_scalar_allreduce(
     comm,
     static_cast(num_local_unique_edge_minors) / static_cast(minor_size),
+    raft::comms::op_t::MAX,
     handle.get_stream());
   if (max_major_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) {

From 74ee5c4a7db6e3aed039e4c05dd868d6300094da Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 16 Sep 2021 08:33:54 -0400
Subject: [PATCH 47/57] bug fix
---
 cpp/src/structure/graph_impl.cuh | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index 5789aefd0d7..206594c8348 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -264,7 +264,7 @@ graph_t
Date: Thu, 16 Sep 2021 08:35:36 -0400
Subject: [PATCH 48/57] update row/col properties classes
---
 .../copy_v_transform_reduce_in_out_nbr.cuh    |   4 +-
 ...ransform_reduce_key_aggregated_out_nbr.cuh |   2 +-
 .../cugraph/prims/row_col_properties.cuh      | 326 +++++++++++++++---
 ...orm_reduce_by_adj_matrix_row_col_key_e.cuh |   9 +-
 .../cugraph/prims/transform_reduce_e.cuh      |   4 +-
 .../update_frontier_v_push_if_out_nbr.cuh     |   2 +-
 cpp/src/community/louvain.cuh                 |  20 +-
 .../weakly_connected_components_impl.cuh      |   2 +-
 cpp/src/structure/coarsen_graph_impl.cuh      |   2 +-
 cpp/src/traversal/sssp_impl.cuh               |   2 +-
 10 files changed, 300 insertions(+), 73 deletions(-)

diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
index e208c13c412..6900c2a0d07 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
@@ -479,9 +479,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     auto matrix_partition_row_value_input = adj_matrix_row_value_input;
     auto matrix_partition_col_value_input = adj_matrix_col_value_input;
     if constexpr (GraphViewType::is_adj_matrix_transposed) {
-      matrix_partition_col_value_input.add_offset(matrix_partition.get_major_value_start_offset());
+      matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i);
     } else {
-      matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset());
+      matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i);
     }
 
     std::conditional_t<
diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
index f21ffebde53..a11e757b797 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -624,7 +624,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
     auto tmp_e_op_result_buffer_first = get_dataframe_buffer_begin(tmp_e_op_result_buffer);
 
     auto matrix_partition_row_value_input =
adj_matrix_row_value_input; - matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index fbece29ceb0..ca6f16954ee 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -36,31 +36,116 @@ namespace cugraph { namespace detail { template -class row_col_properties_device_view_t { +class major_properties_device_view_t { public: using value_type = typename thrust::iterator_traits::value_type; - row_col_properties_device_view_t() = default; + major_properties_device_view_t() = default; - row_col_properties_device_view_t(ValueIterator value_first) - : key_first_(thrust::nullopt), key_last_(thrust::nullopt), key_offset_(0), value_first_(value_first) + major_properties_device_view_t( + ValueIterator value_first) // for single-GPU only and for advanced users + : value_first_(value_first) { } - row_col_properties_device_view_t(vertex_t const* key_first, - vertex_t const* key_last, - ValueIterator value_first) - : key_first_(key_first), key_last_(key_last), key_offset_(0), value_first_(value_first) + major_properties_device_view_t(ValueIterator value_first, + vertex_t const* matrix_partition_major_value_start_offsets) + : value_first_(value_first), + matrix_partition_major_value_start_offsets_(matrix_partition_major_value_start_offsets) { + set_local_adj_matrix_partition_idx(size_t{0}); } - void add_offset(vertex_t offset) { + major_properties_device_view_t(vertex_t const* key_first, + ValueIterator value_first, + vertex_t const* matrix_partition_key_offsets, + vertex_t const* matrix_partition_major_firsts) + : key_first_(key_first), + value_first_(value_first), + matrix_partition_key_offsets_(matrix_partition_key_offsets), + matrix_partition_major_firsts_(matrix_partition_major_firsts) + { + set_local_adj_matrix_partition_idx(size_t{0}); + } + + void set_local_adj_matrix_partition_idx(size_t adj_matrix_partition_idx) + { if (key_first_) { - *key_offset_ += offset; + matrix_partition_key_first_ = + *key_first_ + (*matrix_partition_key_offsets_)[adj_matrix_partition_idx]; + matrix_partition_key_last_ = + *key_first_ + (*matrix_partition_key_offsets_)[adj_matrix_partition_idx + 1]; + matrix_partition_major_first_ = (*matrix_partition_major_firsts_)[adj_matrix_partition_idx]; + matrix_partition_value_first_ = + value_first_ + (*matrix_partition_key_offsets_)[adj_matrix_partition_idx]; + } else { + if (matrix_partition_major_value_start_offsets_) { + matrix_partition_value_first_ = + value_first_ + (*matrix_partition_major_value_start_offsets_)[adj_matrix_partition_idx]; + } else { + assert(adj_matrix_partition_idx == 0); + matrix_partition_value_first_ = value_first_; + } } - else { - value_first_ += offset; + } + + ValueIterator value_data() const { return value_first_; } + + __device__ ValueIterator get_iter(vertex_t offset) const + { + auto value_offset = offset; + if (matrix_partition_key_first_) { + auto it = thrust::lower_bound(thrust::seq, + *matrix_partition_key_first_, + *matrix_partition_key_last_, + *matrix_partition_major_first_ + offset); + assert((it != *matrix_partition_key_last_) && + (*it == (*matrix_partition_major_first_ + offset))); + 
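+      // keys are the sorted unique majors with local edges and values are stored
+      // only for those keys, so convert the major offset to a value offset using
+      // the position of the matching key (found by the binary search above)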
value_offset = static_cast(thrust::distance(*matrix_partition_key_first_, it)); } + return matrix_partition_value_first_ + value_offset; + } + + __device__ value_type get(vertex_t offset) const { return *get_iter(offset); } + + private: + thrust::optional key_first_{thrust::nullopt}; + ValueIterator value_first_{}; + + thrust::optional matrix_partition_key_offsets_{thrust::nullopt}; // host data + thrust::optional matrix_partition_major_firsts_{thrust::nullopt}; // host data + + thrust::optional matrix_partition_major_value_start_offsets_{ + thrust::nullopt}; // host data + + thrust::optional matrix_partition_key_first_{thrust::nullopt}; + thrust::optional matrix_partition_key_last_{thrust::nullopt}; + thrust::optional matrix_partition_major_first_{thrust::nullopt}; + + ValueIterator matrix_partition_value_first_{}; +}; + +template +class minor_properties_device_view_t { + public: + using value_type = typename thrust::iterator_traits::value_type; + + minor_properties_device_view_t() = default; + + minor_properties_device_view_t(ValueIterator value_first) + : value_first_(value_first) + { + } + + minor_properties_device_view_t(vertex_t const* key_first, + vertex_t const* key_last, + vertex_t minor_first, + ValueIterator value_first) + : key_first_(key_first), + key_last_(key_last), + minor_first_(minor_first), + value_first_(value_first) + { } ValueIterator value_data() const { return value_first_; } @@ -69,8 +154,8 @@ class row_col_properties_device_view_t { { auto value_offset = offset; if (key_first_) { - auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, offset + *key_offset_); - assert((it != *key_last_) && (*it == (offset + *key_offset_))); + auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, *minor_first_ + offset); + assert((it != *key_last_) && (*it == (*minor_first_ + offset))); value_offset = static_cast(thrust::distance(*key_first_, it)); } return value_first_ + value_offset; @@ -81,33 +166,105 @@ class row_col_properties_device_view_t { private: thrust::optional key_first_{thrust::nullopt}; thrust::optional key_last_{thrust::nullopt}; - thrust::optional key_offset_{0}; + thrust::optional minor_first_{thrust::nullopt}; ValueIterator value_first_{}; }; template -class row_col_properties_t { +class major_properties_t { public: - row_col_properties_t() - : key_first_(std::nullopt), - key_last_(std::nullopt), - buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) + major_properties_t() : buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) {} + + major_properties_t(raft::handle_t const& handle, + vertex_t buffer_size, + std::vector&& matrix_partition_major_value_start_offsets) + : buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())), + matrix_partition_major_value_start_offsets_( + std::move(matrix_partition_major_value_start_offsets)) + { + } + + major_properties_t(raft::handle_t const& handle, + vertex_t const* key_first, + std::vector&& matrix_partition_key_offsets, + std::vector&& matrix_partition_major_firsts) + : key_first_(key_first), + buffer_( + allocate_dataframe_buffer(matrix_partition_key_offsets.back(), handle.get_stream())), + matrix_partition_key_offsets_(std::move(matrix_partition_key_offsets)), + matrix_partition_major_firsts_(std::move(matrix_partition_major_firsts)) + { + } + + void fill(T value, rmm::cuda_stream_view stream) + { + thrust::fill( + rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value); + } + + auto key_first() { return key_first_; } + auto 
key_last() { return key_first_ + matrix_partition_key_offsets_.back(); } + auto value_data() { return get_dataframe_buffer_begin(buffer_); } + + auto device_view() const + { + auto value_first = get_dataframe_buffer_cbegin(buffer_); + if (key_first_) { + return major_properties_device_view_t( + *key_first_, + value_first, + (*matrix_partition_key_offsets_).data(), + (*matrix_partition_major_firsts_).data()); + } else { + return major_properties_device_view_t( + value_first, (*matrix_partition_major_value_start_offsets_).data()); + } + } + + auto mutable_device_view() { + auto value_first = get_dataframe_buffer_begin(buffer_); + if (key_first_) { + return major_properties_device_view_t( + *key_first_, + value_first, + (*matrix_partition_key_offsets_).data(), + (*matrix_partition_major_firsts_).data()); + } else { + return major_properties_device_view_t( + value_first, (*matrix_partition_major_value_start_offsets_).data()); + } } - row_col_properties_t(raft::handle_t const& handle, vertex_t buffer_size) - : key_first_(std::nullopt), - key_last_(std::nullopt), - buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) + private: + std::optional key_first_{std::nullopt}; + + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; + + std::optional> matrix_partition_key_offsets_{std::nullopt}; + std::optional> matrix_partition_major_firsts_{std::nullopt}; + + std::optional> matrix_partition_major_value_start_offsets_{std::nullopt}; +}; + +template +class minor_properties_t { + public: + minor_properties_t() : buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) {} + + minor_properties_t(raft::handle_t const& handle, vertex_t buffer_size) + : buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) { } - row_col_properties_t(raft::handle_t const& handle, - vertex_t const* key_first, - vertex_t const* key_last) + minor_properties_t(raft::handle_t const& handle, + vertex_t const* key_first, + vertex_t const* key_last, + vertex_t minor_first) : key_first_(key_first), key_last_(key_last), + minor_first_(minor_first), buffer_( allocate_dataframe_buffer(thrust::distance(key_first, key_last), handle.get_stream())) { @@ -127,10 +284,10 @@ class row_col_properties_t { { auto value_first = get_dataframe_buffer_cbegin(buffer_); if (key_first_) { - return row_col_properties_device_view_t( - *key_first_, *key_last_, value_first); + return minor_properties_device_view_t( + *key_first_, *key_last_, *minor_first_, value_first); } else { - return row_col_properties_device_view_t(value_first); + return minor_properties_device_view_t(value_first); } } @@ -138,16 +295,17 @@ class row_col_properties_t { { auto value_first = get_dataframe_buffer_begin(buffer_); if (key_first_) { - return row_col_properties_device_view_t( - *key_first_, *key_last_, value_first); + return minor_properties_device_view_t( + *key_first_, *key_last_, *minor_first_, value_first); } else { - return row_col_properties_device_view_t(value_first); + return minor_properties_device_view_t(value_first); } } private: std::optional key_first_{std::nullopt}; std::optional key_last_{std::nullopt}; + std::optional minor_first_{std::nullopt}; decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; }; @@ -181,14 +339,47 @@ class row_properties_t { row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { + using vertex_t = typename GraphViewType::vertex_type; + auto key_first = graph_view.get_local_sorted_unique_edge_row_begin(); - auto key_last = 
graph_view.get_local_sorted_unique_edge_row_end(); if (key_first) { - properties_ = detail::row_col_properties_t( - handle, *key_first, *key_last); + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { + auto key_last = graph_view.get_local_sorted_unique_edge_row_end(); + properties_ = detail::minor_properties_t( + handle, *key_first, *key_last, graph_view.get_local_adj_matrix_partition_row_first()); + } else { + std::vector matrix_partition_major_firsts( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_firsts[i] = + graph_view.get_local_adj_matrix_partition_row_first(i); + } + properties_ = detail::major_properties_t( + handle, + *key_first, + *(graph_view.get_local_sorted_unique_edge_row_offsets()), + std::move(matrix_partition_major_firsts)); + } + } else { + assert(false); + } } else { - properties_ = detail::row_col_properties_t( - handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); + if constexpr (GraphViewType::is_adj_matrix_transposed) { + properties_ = detail::minor_properties_t( + handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); + } else { + std::vector matrix_partition_major_value_start_offsets( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_value_start_offsets[i] = + graph_view.get_local_adj_matrix_partition_row_value_start_offset(i); + } + properties_ = detail::major_properties_t( + handle, + graph_view.get_number_of_local_adj_matrix_partition_rows(), + std::move(matrix_partition_major_value_start_offsets)); + } } } @@ -203,7 +394,10 @@ class row_properties_t { auto mutable_device_view() { return properties_.mutable_device_view(); } private: - detail::row_col_properties_t properties_{}; + std::conditional_t, + detail::major_properties_t> + properties_{}; }; template @@ -217,14 +411,47 @@ class col_properties_t { col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { + using vertex_t = typename GraphViewType::vertex_type; + auto key_first = graph_view.get_local_sorted_unique_edge_col_begin(); - auto key_last = graph_view.get_local_sorted_unique_edge_col_end(); if (key_first) { - properties_ = detail::row_col_properties_t( - handle, *key_first, *key_last); + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { + std::vector matrix_partition_major_firsts( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_firsts[i] = + graph_view.get_local_adj_matrix_partition_col_first(i); + } + properties_ = detail::major_properties_t( + handle, + *key_first, + *(graph_view.get_local_sorted_unique_edge_col_offsets()), + std::move(matrix_partition_major_firsts)); + } else { + auto key_last = graph_view.get_local_sorted_unique_edge_col_end(); + properties_ = detail::minor_properties_t( + handle, *key_first, *key_last, graph_view.get_local_adj_matrix_partition_col_first()); + } + } else { + assert(false); + } } else { - properties_ = detail::row_col_properties_t( - handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); + if constexpr (GraphViewType::is_adj_matrix_transposed) { + std::vector matrix_partition_major_value_start_offsets( + 
graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_value_start_offsets[i] = + graph_view.get_local_adj_matrix_partition_col_value_start_offset(i); + } + properties_ = detail::major_properties_t( + handle, + graph_view.get_number_of_local_adj_matrix_partition_cols(), + std::move(matrix_partition_major_value_start_offsets)); + } else { + properties_ = detail::minor_properties_t( + handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); + } } } @@ -239,7 +466,10 @@ class col_properties_t { auto mutable_device_view() { return properties_.mutable_device_view(); } private: - detail::row_col_properties_t properties_{}; + std::conditional_t, + detail::minor_properties_t> + properties_{}; }; template @@ -247,7 +477,7 @@ class dummy_properties_device_view_t { public: using value_type = thrust::nullopt_t; - void add_offset(vertex_t offset) {} // no-op + void set_local_adj_matrix_partition_idx(size_t adj_matrix_partition_idx) {} // no-op __device__ auto get(vertex_t offset) const { return thrust::nullopt; } }; @@ -261,11 +491,11 @@ class dummy_properties_t { }; template -auto device_view_concat(detail::row_col_properties_device_view_t... device_views) +auto device_view_concat(detail::major_properties_device_view_t... device_views) { auto concat_first = thrust::make_zip_iterator( thrust_tuple_cat(detail::to_thrust_tuple(device_views.value_data())...)); - return detail::row_col_properties_device_view_t(concat_first); + return detail::major_properties_device_view_t(concat_first); } } // namespace cugraph diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 79807b3728d..1ee2dd5b2d8 100644 --- a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -392,17 +392,14 @@ transform_reduce_by_adj_matrix_row_col_key_e( auto matrix_partition_row_value_input = adj_matrix_row_value_input; auto matrix_partition_col_value_input = adj_matrix_col_value_input; if constexpr (GraphViewType::is_adj_matrix_transposed) { - matrix_partition_col_value_input.add_offset( - matrix_partition.get_major_value_start_offset()); + matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i); } else { - matrix_partition_row_value_input.add_offset( - matrix_partition.get_major_value_start_offset()); + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); } auto matrix_partition_row_col_key_input = adj_matrix_row_col_key_input; if constexpr ((adj_matrix_row_key && !GraphViewType::is_adj_matrix_transposed) || (!adj_matrix_row_key && GraphViewType::is_adj_matrix_transposed)) { - matrix_partition_row_col_key_input.add_offset( - matrix_partition.get_major_value_start_offset()); + matrix_partition_row_col_key_input.set_local_adj_matrix_partition_idx(i); } auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); diff --git a/cpp/include/cugraph/prims/transform_reduce_e.cuh b/cpp/include/cugraph/prims/transform_reduce_e.cuh index 990730dee32..275fa11a95e 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e.cuh @@ -424,9 +424,9 @@ T transform_reduce_e(raft::handle_t const& handle, auto matrix_partition_row_value_input = adj_matrix_row_value_input; auto matrix_partition_col_value_input 
= adj_matrix_col_value_input; if constexpr (GraphViewType::is_adj_matrix_transposed) { - matrix_partition_col_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i); } else { - matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); } auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh index 422bfbd82fa..c84fcd19ce4 100644 --- a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh @@ -980,7 +980,7 @@ void update_frontier_v_push_if_out_nbr( auto matrix_partition_row_value_input = adj_matrix_row_value_input; auto matrix_partition_col_value_input = adj_matrix_col_value_input; - matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 9b774bc3f4d..23adca97342 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -260,11 +260,11 @@ class Louvain { current_graph_view_, graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::major_properties_device_view_t( next_clusters_v_.begin()), graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::minor_properties_device_view_t( next_clusters_v_.begin()), [] __device__(auto, auto, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { @@ -396,11 +396,11 @@ class Louvain { current_graph_view_, graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::major_properties_device_view_t( next_clusters_v_.data()), graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::minor_properties_device_view_t( next_clusters_v_.data()), [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { weight_t sum{0}; @@ -491,13 +491,13 @@ class Louvain { src_cluster_weights.device_view(), src_old_cluster_sum_subtract_pairs.device_view()) : device_view_concat( - detail::row_col_properties_device_view_t( + detail::major_properties_device_view_t( vertex_weights_v_.data()), - detail::row_col_properties_device_view_t( + detail::major_properties_device_view_t( next_clusters_v_.data()), - detail::row_col_properties_device_view_t( + detail::major_properties_device_view_t( vertex_cluster_weights_v.data()), - detail::row_col_properties_device_view_t( cluster_old_sum_subtract_pair_first)); @@ -507,7 +507,7 @@ class Louvain { zipped_src_device_view, graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::minor_properties_device_view_t( next_clusters_v_.data()), cluster_keys_v_.begin(), cluster_keys_v_.end(), @@ -539,7 +539,7 @@ class Louvain { dummy_properties_t{}.device_view(), graph_view_t::is_multi_gpu ? 
src_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::major_properties_device_view_t( next_clusters_v_.data()), detail::return_edge_weight_t{}, weight_t{0}); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 1f8408b4f4a..0b672151708 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -544,7 +544,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, [col_components = GraphViewType::is_multi_gpu ? adj_matrix_col_components.mutable_device_view() - : detail::row_col_properties_device_view_t(level_components), + : detail::minor_properties_device_view_t(level_components), col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), edge_buffer_first = get_dataframe_buffer_begin(edge_buffer), diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 3d9e99a6fea..5e66b1c7667 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -494,7 +494,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view()), labels, - detail::row_col_properties_device_view_t(labels), + detail::minor_properties_device_view_t(labels), graph_view.get_local_adj_matrix_partition_segment_offsets(0)); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 3a95fdb8fbc..ba91d485d65 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -167,7 +167,7 @@ void sssp(raft::handle_t const& handle, std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, GraphViewType::is_multi_gpu ? 
adj_matrix_row_distances.device_view() - : detail::row_col_properties_device_view_t(distances), + : detail::major_properties_device_view_t(distances), dummy_properties_t{}.device_view(), [vertex_partition, distances, cutoff] __device__( vertex_t src, vertex_t dst, weight_t w, auto src_val, auto) { From d5ee1462f571c1bcc0c101824821a5e19385367a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 16 Sep 2021 11:13:07 -0400 Subject: [PATCH 49/57] temp debug printouts --- cpp/src/community/louvain.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 23adca97342..dfa0c9725e8 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -175,6 +175,7 @@ class Louvain { compute_vertex_and_cluster_weights(); weight_t new_Q = update_clustering(total_edge_weight, resolution); +std::cout << graph_view_t::is_multi_gpu << " new_Q=" << new_Q << " dendrogram_->num_levels()=" << dendrogram_->num_levels() << " max_level=" << max_level << " total_edge_weight=" << total_edge_weight << " resolution=" << resolution << std::endl; if (new_Q <= best_modularity) { break; } @@ -355,6 +356,7 @@ class Louvain { } weight_t new_Q = modularity(total_edge_weight, resolution); +std::cout << graph_view_t::is_multi_gpu << "update_clustering new_Q=" << new_Q << std::endl; weight_t cur_Q = new_Q - 1; // To avoid the potential of having two vertices swap clusters @@ -370,6 +372,7 @@ class Louvain { up_down = !up_down; new_Q = modularity(total_edge_weight, resolution); +std::cout << graph_view_t::is_multi_gpu << "update_clustering loop new_Q=" << new_Q << std::endl; if (new_Q > cur_Q) { raft::copy(dendrogram_->current_level_begin(), @@ -523,6 +526,7 @@ class Louvain { cugraph::get_dataframe_buffer_begin(output_buffer), next_clusters_v_.begin(), detail::cluster_update_op_t{up_down}); +raft::print_device_vector("new next_clusters_v_", next_clusters_v_.data(), next_clusters_v_.size(), std::cout); if constexpr (graph_view_t::is_multi_gpu) { copy_to_adj_matrix_row( From e9b850a245e04296e24e6716165e8df84ea738d7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 17 Sep 2021 06:42:02 -0400 Subject: [PATCH 50/57] undo debug printouts --- cpp/src/community/louvain.cuh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index dfa0c9725e8..23adca97342 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -175,7 +175,6 @@ class Louvain { compute_vertex_and_cluster_weights(); weight_t new_Q = update_clustering(total_edge_weight, resolution); -std::cout << graph_view_t::is_multi_gpu << " new_Q=" << new_Q << " dendrogram_->num_levels()=" << dendrogram_->num_levels() << " max_level=" << max_level << " total_edge_weight=" << total_edge_weight << " resolution=" << resolution << std::endl; if (new_Q <= best_modularity) { break; } @@ -356,7 +355,6 @@ std::cout << graph_view_t::is_multi_gpu << " new_Q=" << new_Q << " dendrogram_-> } weight_t new_Q = modularity(total_edge_weight, resolution); -std::cout << graph_view_t::is_multi_gpu << "update_clustering new_Q=" << new_Q << std::endl; weight_t cur_Q = new_Q - 1; // To avoid the potential of having two vertices swap clusters @@ -372,7 +370,6 @@ std::cout << graph_view_t::is_multi_gpu << "update_clustering new_Q=" << new_Q < up_down = !up_down; new_Q = modularity(total_edge_weight, resolution); -std::cout << graph_view_t::is_multi_gpu << "update_clustering loop new_Q=" << new_Q << std::endl; if (new_Q > cur_Q) { 
raft::copy(dendrogram_->current_level_begin(), @@ -526,7 +523,6 @@ std::cout << graph_view_t::is_multi_gpu << "update_clustering loop new_Q=" << ne cugraph::get_dataframe_buffer_begin(output_buffer), next_clusters_v_.begin(), detail::cluster_update_op_t{up_down}); -raft::print_device_vector("new next_clusters_v_", next_clusters_v_.data(), next_clusters_v_.size(), std::cout); if constexpr (graph_view_t::is_multi_gpu) { copy_to_adj_matrix_row( From ec02b9d8ecabbd1c79e07da5c1c21e1d8f9c0809 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 17 Sep 2021 13:45:36 -0400 Subject: [PATCH 51/57] bug fix --- .../cugraph/prims/row_col_properties.cuh | 113 ++++++++++++++---- 1 file changed, 87 insertions(+), 26 deletions(-) diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index ca6f16954ee..73358f64dd2 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -46,6 +46,7 @@ class major_properties_device_view_t { ValueIterator value_first) // for single-GPU only and for advanced users : value_first_(value_first) { + set_local_adj_matrix_partition_idx(size_t{0}); } major_properties_device_view_t(ValueIterator value_first, @@ -89,8 +90,34 @@ class major_properties_device_view_t { } } + std::optional key_data() const + { + return key_first_ ? std::optional{*key_first_} : std::nullopt; + } + ValueIterator value_data() const { return value_first_; } + std::optional matrix_partition_key_offsets() const + { + return matrix_partition_key_offsets_ + ? std::optional{*matrix_partition_key_offsets_} + : std::nullopt; + } + + std::optional matrix_partition_major_firsts() const + { + return matrix_partition_major_firsts_ + ? std::optional{*matrix_partition_major_firsts_} + : std::nullopt; + } + + std::optional matrix_partition_major_value_start_offsets() const + { + return matrix_partition_major_value_start_offsets_ + ? 
std::optional{*matrix_partition_major_value_start_offsets_} + : std::nullopt; + } + __device__ ValueIterator get_iter(vertex_t offset) const { auto value_offset = offset; @@ -132,10 +159,7 @@ class minor_properties_device_view_t { minor_properties_device_view_t() = default; - minor_properties_device_view_t(ValueIterator value_first) - : value_first_(value_first) - { - } + minor_properties_device_view_t(ValueIterator value_first) : value_first_(value_first) {} minor_properties_device_view_t(vertex_t const* key_first, vertex_t const* key_last, @@ -176,6 +200,11 @@ class major_properties_t { public: major_properties_t() : buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) {} + major_properties_t(raft::handle_t const& handle, vertex_t buffer_size) + : buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) + { + } + major_properties_t(raft::handle_t const& handle, vertex_t buffer_size, std::vector&& matrix_partition_major_value_start_offsets) @@ -216,9 +245,11 @@ class major_properties_t { value_first, (*matrix_partition_key_offsets_).data(), (*matrix_partition_major_firsts_).data()); - } else { + } else if (matrix_partition_major_value_start_offsets_) { return major_properties_device_view_t( value_first, (*matrix_partition_major_value_start_offsets_).data()); + } else { + return major_properties_device_view_t(value_first); } } @@ -231,9 +262,11 @@ class major_properties_t { value_first, (*matrix_partition_key_offsets_).data(), (*matrix_partition_major_firsts_).data()); - } else { + } else if (matrix_partition_major_value_start_offsets_) { return major_properties_device_view_t( value_first, (*matrix_partition_major_value_start_offsets_).data()); + } else { + return major_properties_device_view_t(value_first); } } @@ -326,6 +359,12 @@ auto to_thrust_tuple(Iterator iter) return iter.get_iterator_tuple(); } +template +decltype(auto) get_first_of_pack(T&& t, Ts&&...) 
+{ + return std::forward(t); +} + } // namespace detail template @@ -369,16 +408,21 @@ class row_properties_t { properties_ = detail::minor_properties_t( handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); } else { - std::vector matrix_partition_major_value_start_offsets( - graph_view.get_number_of_local_adj_matrix_partitions()); - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - matrix_partition_major_value_start_offsets[i] = - graph_view.get_local_adj_matrix_partition_row_value_start_offset(i); + if constexpr (GraphViewType::is_multi_gpu) { + std::vector matrix_partition_major_value_start_offsets( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_value_start_offsets[i] = + graph_view.get_local_adj_matrix_partition_row_value_start_offset(i); + } + properties_ = detail::major_properties_t( + handle, + graph_view.get_number_of_local_adj_matrix_partition_rows(), + std::move(matrix_partition_major_value_start_offsets)); + } else { + properties_ = detail::major_properties_t( + handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); } - properties_ = detail::major_properties_t( - handle, - graph_view.get_number_of_local_adj_matrix_partition_rows(), - std::move(matrix_partition_major_value_start_offsets)); } } } @@ -438,16 +482,21 @@ class col_properties_t { } } else { if constexpr (GraphViewType::is_adj_matrix_transposed) { - std::vector matrix_partition_major_value_start_offsets( - graph_view.get_number_of_local_adj_matrix_partitions()); - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - matrix_partition_major_value_start_offsets[i] = - graph_view.get_local_adj_matrix_partition_col_value_start_offset(i); + if constexpr (GraphViewType::is_multi_gpu) { + std::vector matrix_partition_major_value_start_offsets( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_value_start_offsets[i] = + graph_view.get_local_adj_matrix_partition_col_value_start_offset(i); + } + properties_ = detail::major_properties_t( + handle, + graph_view.get_number_of_local_adj_matrix_partition_cols(), + std::move(matrix_partition_major_value_start_offsets)); + } else { + properties_ = detail::major_properties_t( + handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); } - properties_ = detail::major_properties_t( - handle, - graph_view.get_number_of_local_adj_matrix_partition_cols(), - std::move(matrix_partition_major_value_start_offsets)); } else { properties_ = detail::minor_properties_t( handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); @@ -491,11 +540,23 @@ class dummy_properties_t { }; template -auto device_view_concat(detail::major_properties_device_view_t... device_views) +auto device_view_concat(detail::major_properties_device_view_t const&... 
device_views) { auto concat_first = thrust::make_zip_iterator( thrust_tuple_cat(detail::to_thrust_tuple(device_views.value_data())...)); - return detail::major_properties_device_view_t(concat_first); + auto first = detail::get_first_of_pack(device_views...); + if (first.key_data()) { + return detail::major_properties_device_view_t( + *(first.key_data()), + concat_first, + *(first.matrix_partition_key_offsets()), + *(first.matrix_partition_major_firsts())); + } else if (first.matrix_partition_major_value_start_offsets()) { + return detail::major_properties_device_view_t( + concat_first, *(first.matrix_partition_major_value_start_offsets())); + } else { + return detail::major_properties_device_view_t(concat_first); + } } } // namespace cugraph From 82b6f75d790ee4286d4ee743fc06db71cae25d0e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 17 Sep 2021 15:35:58 -0400 Subject: [PATCH 52/57] bug fix --- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 301edae3173..655a1b5fba5 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -754,7 +754,7 @@ renumber_edgelist( edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part vertex_t max_segment_size{0}; - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + for (size_t i = 0; i < row_comm_size; ++i) { max_segment_size = std::max( max_segment_size, partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i)); } From 2007fe1e6eee1eb0d81ac65fff7ece07e47fcb77 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 17 Sep 2021 15:37:08 -0400 Subject: [PATCH 53/57] fix compiler warning --- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 655a1b5fba5..6c13a44652d 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -754,7 +754,7 @@ renumber_edgelist( edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part vertex_t max_segment_size{0}; - for (size_t i = 0; i < row_comm_size; ++i) { + for (int i = 0; i < row_comm_size; ++i) { max_segment_size = std::max( max_segment_size, partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i)); } From e264dc2032b08a5dd6fef7aaef6c2e1a43a4a679 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 18 Sep 2021 12:51:09 -0400 Subject: [PATCH 54/57] bug fix --- .../prims/copy_to_adj_matrix_row_col.cuh | 4 ++-- .../cugraph/prims/row_col_properties.cuh | 7 ++++++- cpp/src/structure/graph_impl.cuh | 18 ++++++++++++++++-- cpp/src/structure/graph_view_impl.cuh | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index b9c54c758c1..7100e7c8663 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -240,7 +240,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, matrix_major_value_output.value_data() + (*key_offsets)[i]] __device__(auto i) { auto major = *(output_key_first + i); auto it = 
thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, major); - if (*it == major) { + if ((it != rx_vertex_last) && (*it == major)) { auto rx_value = *(rx_value_first + thrust::distance(rx_vertex_first, it)); *(output_value_first + i) = rx_value; } @@ -479,7 +479,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, matrix_minor_value_output.value_data() + (*key_offsets)[i]] __device__(auto i) { auto minor = *(output_key_first + i); auto it = thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, minor); - if (*it == minor) { + if ((it != rx_vertex_last) && (*it == minor)) { auto rx_value = *(rx_value_first + thrust::distance(rx_vertex_first, it)); *(output_value_first + i) = rx_value; } diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index 73358f64dd2..ec219f5290f 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -233,7 +233,12 @@ class major_properties_t { } auto key_first() { return key_first_; } - auto key_last() { return key_first_ + matrix_partition_key_offsets_.back(); } + auto key_last() + { + return key_first_ ? std::make_optional(*key_first_ + + (*matrix_partition_key_offsets_).back()) + : std::nullopt; + } auto value_data() { return get_dataframe_buffer_begin(buffer_); } auto device_view() const diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 206594c8348..4e393e43504 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -50,7 +50,7 @@ struct out_of_range_t { vertex_t minor_first{}; vertex_t minor_last{}; - __device__ bool operator()(thrust::tuple t) + __device__ bool operator()(thrust::tuple t) const { auto major = thrust::get<0>(t); auto minor = thrust::get<1>(t); @@ -428,14 +428,28 @@ graph_t{major_first + + (*adj_matrix_partition_segment_offsets_) + [(*(meta.segment_offsets)).size() * i + + detail::num_sparse_segments_per_vertex_partition]} + : std::nullopt; cur_size += thrust::distance( local_sorted_unique_edge_majors.data() + cur_size, thrust::copy_if( handle.get_thrust_policy(), thrust::make_counting_iterator(major_first), - thrust::make_counting_iterator(major_last), + thrust::make_counting_iterator(use_dcs ? 
*major_hypersparse_first : major_last), local_sorted_unique_edge_majors.data() + cur_size, has_nzd_t{adj_matrix_partition_offsets_[i].data(), major_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*adj_matrix_partition_dcs_nzd_vertices_)[i].begin(), + (*adj_matrix_partition_dcs_nzd_vertices_)[i].begin() + + (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i], + local_sorted_unique_edge_majors.data() + cur_size); + cur_size += (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i]; + } } assert(cur_size == num_local_unique_edge_majors); diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 87c7351f07a..41d43ab27a8 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -49,7 +49,7 @@ struct out_of_range_t { vertex_t min{}; vertex_t max{}; - __device__ bool operator()(vertex_t v) { return (v < min) || (v >= max); } + __device__ bool operator()(vertex_t v) const { return (v < min) || (v >= max); } }; template From 28e0741b89e3fb274fa177eaf7fd9c9f025f3fc1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 19 Sep 2021 14:41:56 -0400 Subject: [PATCH 55/57] clang-format --- cpp/src/components/weakly_connected_components_impl.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 0b672151708..66c9447605d 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -545,9 +545,8 @@ void weakly_connected_components_impl(raft::handle_t const& handle, GraphViewType::is_multi_gpu ? adj_matrix_col_components.mutable_device_view() : detail::minor_properties_device_view_t(level_components), - col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), - edge_buffer_first = - get_dataframe_buffer_begin(edge_buffer), + col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), + edge_buffer_first = get_dataframe_buffer_begin(edge_buffer), num_edge_inserts = num_edge_inserts.data()] __device__(auto tagged_src, vertex_t dst, auto, auto) { auto tag = thrust::get<1>(tagged_src); From f7af95b7bcde7b83649da9c8302520f3a1ff3ddc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 19 Sep 2021 15:55:42 -0400 Subject: [PATCH 56/57] adjust variable scope to free memory buffer when unnecessary --- cpp/src/structure/renumber_edgelist_impl.cuh | 73 +++++++++++--------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 6c13a44652d..ec6c0696694 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -702,42 +702,47 @@ renumber_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif - vertex_t max_matrix_partition_major_size{0}; - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - max_matrix_partition_major_size = - std::max(max_matrix_partition_major_size, partition.get_matrix_partition_major_size(i)); - } - rmm::device_uvector renumber_map_major_labels(max_matrix_partition_major_size, - handle.get_stream()); - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - device_bcast(col_comm, - renumber_map_labels.data(), - renumber_map_major_labels.data(), - partition.get_matrix_partition_major_size(i), - i, - handle.get_stream()); + { + vertex_t max_matrix_partition_major_size{0}; +
for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + max_matrix_partition_major_size = + std::max(max_matrix_partition_major_size, partition.get_matrix_partition_major_size(i)); + } + rmm::device_uvector renumber_map_major_labels(max_matrix_partition_major_size, + handle.get_stream()); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + device_bcast(col_comm, + renumber_map_labels.data(), + renumber_map_major_labels.data(), + partition.get_matrix_partition_major_size(i), + i, + handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream - auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); - auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); - cuco::static_map - renumber_map{ - // cuco::static_map requires at least one empty slot - std::max(static_cast( - static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), - static_cast(partition.get_matrix_partition_major_size(i)) + 1), - invalid_vertex_id::value, - invalid_vertex_id::value, - stream_adapter}; - auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( - renumber_map_major_labels.begin(), - thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))); - renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); - renumber_map.find(edgelist_major_vertices[i], - edgelist_major_vertices[i] + edgelist_edge_counts[i], - edgelist_major_vertices[i]); + auto poly_alloc = + rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = + rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + cuco::static_map + renumber_map{ + // cuco::static_map requires at least one empty slot + std::max( + static_cast(static_cast(partition.get_matrix_partition_major_size(i)) / + load_factor), + static_cast(partition.get_matrix_partition_major_size(i)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter}; + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( + renumber_map_major_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))); + renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); + renumber_map.find(edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + edgelist_major_vertices[i]); + } } // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two From 78cfbda5427ac20b78295ab83860736a9f88f32d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 19 Sep 2021 17:36:17 -0400 Subject: [PATCH 57/57] disable (key, value) pairs --- cpp/include/cugraph/graph_view.hpp | 4 +-- cpp/include/cugraph/utilities/cython.hpp | 9 ------ cpp/src/utilities/cython.cu | 41 ++++++++++++------------ 3 files changed, 22 insertions(+), 32 deletions(-) diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 2475cb71995..fa04852133b 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -223,10 +223,10 @@ namespace detail { using namespace cugraph::visitors; -// FIXME: threshold values require tuning +// FIXME: threshold values require tuning (currently disabled) // use (key, value) pairs to store row/column 
properties if (unique edge rows/cols) over (V / // row_comm_size|col_comm_size) is smaller than the threshold value -double constexpr row_col_properties_kv_pair_fill_ratio_threshold = 0.25; +double constexpr row_col_properties_kv_pair_fill_ratio_threshold = 0.0; // FIXME: threshold values require tuning // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp index 37000bd57e7..3a4f437bfd0 100644 --- a/cpp/include/cugraph/utilities/cython.hpp +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -102,8 +102,6 @@ struct graph_container_t { size_t num_local_edges; size_t num_global_vertices; size_t num_global_edges; - size_t num_local_unique_edge_rows{}; - size_t num_local_unique_edge_cols{}; numberTypeEnum vertexType; numberTypeEnum edgeType; numberTypeEnum weightType; @@ -272,9 +270,6 @@ struct renum_tuple_t { return std::make_unique>(segment_offsets_); } - vertex_t& get_num_unique_edge_majors(void) { return num_unique_edge_majors_; } - vertex_t& get_num_unique_edge_minors(void) { return num_unique_edge_minors_; } - // `partition_t` pass-through getters // int get_part_row_size() const { return part_.get_row_size(); } @@ -369,8 +364,6 @@ struct renum_tuple_t { vertex_t nv_{0}; edge_t ne_{0}; std::vector segment_offsets_; - vertex_t num_unique_edge_majors_{0}; - vertex_t num_unique_edge_minors_{0}; }; // FIXME: finish description for vertex_partition_offsets @@ -438,8 +431,6 @@ void populate_graph_container(graph_container_t& graph_container, size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, - size_t num_local_unique_edge_rows, - size_t num_local_unique_edge_cols, bool is_weighted, bool is_symmetric, bool transposed, diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 226f9891340..aba35ceea0b 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -169,7 +169,10 @@ std::unique_ptr> crea static_cast(graph_container.segment_offsets), static_cast(graph_container.segment_offsets) + graph_container.num_segments + 1) - : std::nullopt}, + : std::nullopt, + // FIXME: disable (key, value) pairs at this moment (should be enabled once fully tuned). 
+ std::numeric_limits::max(), + std::numeric_limits::max()}, graph_container.do_expensive_check); } @@ -223,8 +226,6 @@ void populate_graph_container(graph_container_t& graph_container, size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, - size_t num_local_unique_edge_rows, - size_t num_local_unique_edge_cols, bool is_weighted, bool is_symmetric, bool transposed, @@ -248,24 +249,22 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.col_comm_rank = col_comm_rank; } - graph_container.src_vertices = src_vertices; - graph_container.dst_vertices = dst_vertices; - graph_container.weights = weights; - graph_container.is_weighted = is_weighted; - graph_container.vertex_partition_offsets = vertex_partition_offsets; - graph_container.segment_offsets = segment_offsets; - graph_container.num_segments = num_segments; - graph_container.num_local_edges = num_local_edges; - graph_container.num_global_vertices = num_global_vertices; - graph_container.num_global_edges = num_global_edges; - graph_container.num_local_unique_edge_rows = num_local_unique_edge_rows; - graph_container.num_local_unique_edge_cols = num_local_unique_edge_cols; - graph_container.vertexType = vertexType; - graph_container.edgeType = edgeType; - graph_container.weightType = weightType; - graph_container.transposed = transposed; - graph_container.is_multi_gpu = multi_gpu; - graph_container.do_expensive_check = do_expensive_check; + graph_container.src_vertices = src_vertices; + graph_container.dst_vertices = dst_vertices; + graph_container.weights = weights; + graph_container.is_weighted = is_weighted; + graph_container.vertex_partition_offsets = vertex_partition_offsets; + graph_container.segment_offsets = segment_offsets; + graph_container.num_segments = num_segments; + graph_container.num_local_edges = num_local_edges; + graph_container.num_global_vertices = num_global_vertices; + graph_container.num_global_edges = num_global_edges; + graph_container.vertexType = vertexType; + graph_container.edgeType = edgeType; + graph_container.weightType = weightType; + graph_container.transposed = transposed; + graph_container.is_multi_gpu = multi_gpu; + graph_container.do_expensive_check = do_expensive_check; graph_properties_t graph_props{.is_symmetric = is_symmetric, .is_multigraph = false}; graph_container.graph_props = graph_props;
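
Note on the PATCH 54 fix in copy_to_adj_matrix_row_col.cuh: thrust::lower_bound returns the end of the searched range when every element compares less than the key, so the pre-fix check `*it == major` dereferenced one past the end in that case. Below is a minimal host-side sketch of the guarded lookup pattern, with hypothetical buffer names (rx_vertices, rx_values) and std::lower_bound standing in for the device-side thrust::lower_bound, which has the same contract:

// Sketch only; names are illustrative, not cuGraph API.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

int main()
{
  std::vector<int32_t> rx_vertices{2, 5, 9};     // sorted vertex IDs received from peers
  std::vector<double> rx_values{0.2, 0.5, 0.9};  // property values aligned with rx_vertices

  for (int32_t v : {5, 7, 11}) {
    auto it = std::lower_bound(rx_vertices.begin(), rx_vertices.end(), v);
    // lower_bound returns end() when v is larger than every element (v == 11 here);
    // the pre-fix code dereferenced it unconditionally, reading past the end.
    if ((it != rx_vertices.end()) && (*it == v)) {
      std::cout << v << " -> " << rx_values[std::distance(rx_vertices.begin(), it)] << '\n';
    } else {
      std::cout << v << " -> not received\n";
    }
  }
  return 0;
}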
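
Note on PATCH 57: per the graph_view.hpp comment, (key, value) pairs are used for row/column properties when the number of unique edge rows/cols divided by V / row_comm_size (or col_comm_size) is smaller than row_col_properties_kv_pair_fill_ratio_threshold; pinning the threshold to 0.0 makes that comparison unsatisfiable, so the dense-array path is always taken. A minimal sketch of the gating decision, with made-up counts standing in for the graph-view internals:

// Sketch only; the counts below are illustrative assumptions.
#include <cstdint>
#include <iostream>

int main()
{
  double constexpr kv_pair_fill_ratio_threshold = 0.0;  // value after PATCH 57

  int64_t num_local_vertices     = 1000000;  // ~ V / row_comm_size (or col_comm_size)
  int64_t num_unique_edge_majors = 50000;    // local vertices that actually appear in edges

  auto fill_ratio =
    static_cast<double>(num_unique_edge_majors) / static_cast<double>(num_local_vertices);

  // Dense path: one value slot per local vertex, O(1) lookup.
  // (key, value) path: storage proportional to unique endpoints, binary-search lookup.
  if (fill_ratio < kv_pair_fill_ratio_threshold) {
    std::cout << "store row/col properties as sorted (key, value) pairs\n";
  } else {
    std::cout << "store row/col properties as a dense value array\n";
  }
  return 0;
}

The pair representation trades a binary search per access for memory proportional to the unique edge endpoints, which only pays off when the fill ratio is small; pinning the threshold to 0.0 rather than deleting the code path keeps the machinery in place for the tuning flagged in the FIXME comments.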