From 97e7d837b7379f5f185ced360aa10144b1067b24 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 22 Feb 2021 19:19:02 -0500 Subject: [PATCH 01/18] Refactor Louvain with new graph primitives 1) Fix Dendrogram methods to be const consistent 2) Update experimental update_by_delta_modularity to use new graph primitives. Clean up unused local code --- cpp/src/community/louvain.cuh | 4 +- cpp/src/experimental/louvain.cuh | 1171 +++++------------------------- 2 files changed, 178 insertions(+), 997 deletions(-) diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index e28f0f1746d..a0f7f1c2421 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -138,7 +138,9 @@ class Louvain { return Q; } - Dendrogram &get_dendrogram() const { return *dendrogram_; } + Dendrogram const &get_dendrogram() const { return *dendrogram_; } + + Dendrogram &get_dendrogram() { return *dendrogram_; } std::unique_ptr> move_dendrogram() { return dendrogram_; } diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index f162cd17a61..3597923d6b9 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -29,11 +29,11 @@ #include #include #include +#include +#include #include #include -#include - #include #include @@ -47,343 +47,6 @@ namespace cugraph { namespace experimental { -namespace detail { - -#ifdef CUCO_STATIC_MAP_DEFINED -template -struct create_cuco_pair_t { - cuco::pair_type __device__ operator()(data_t data) - { - cuco::pair_type tmp; - tmp.first = data; - tmp.second = data_t{0}; - return tmp; - } -}; -#endif - -// -// These classes should allow cuco::static_map to generate hash tables of -// different configurations. -// - -// -// Compare edges based on src[e] and dst[e] matching -// -template -class src_dst_equality_comparator_t { - public: - src_dst_equality_comparator_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - sentinel_t sentinel_value) - : d_src_{src.data().get()}, d_dst_{dst.data().get()}, sentinel_value_(sentinel_value) - { - } - - src_dst_equality_comparator_t(data_t const *d_src, data_t const *d_dst, sentinel_t sentinel_value) - : d_src_{d_src}, d_dst_{d_dst}, sentinel_value_(sentinel_value) - { - } - - template - __device__ bool operator()(idx_type lhs_index, idx_type rhs_index) const noexcept - { - return (lhs_index != sentinel_value_) && (rhs_index != sentinel_value_) && - (d_src_[lhs_index] == d_src_[rhs_index]) && (d_dst_[lhs_index] == d_dst_[rhs_index]); - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - sentinel_t sentinel_value_; -}; - -// -// Hash edges based src[e] and dst[e] -// -template -class src_dst_hasher_t { - public: - src_dst_hasher_t(rmm::device_vector const &src, rmm::device_vector const &dst) - : d_src_{src.data().get()}, d_dst_{dst.data().get()} - { - } - - src_dst_hasher_t(data_t const *d_src, data_t const *d_dst) : d_src_{d_src}, d_dst_{d_dst} {} - - template - __device__ auto operator()(idx_type index) const - { - cuco::detail::MurmurHash3_32 hasher; - - auto h_src = hasher(d_src_[index]); - auto h_dst = hasher(d_dst_[index]); - - /* - * Combine the source hash and the dest hash into a single hash value - * - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - */ - h_src ^= h_dst + 0x9e3779b9 + (h_src << 6) + (h_src >> 2); - - return h_src; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; -}; - -// -// Compare edges based on src[e] and 
cluster[dst[e]] matching -// -template -class src_cluster_equality_comparator_t { - public: - src_cluster_equality_comparator_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - rmm::device_vector const &dst_cluster_cache, - data_t base_dst_id, - sentinel_t sentinel_value) - : d_src_{src.data().get()}, - d_dst_{dst.data().get()}, - d_dst_cluster_{dst_cluster_cache.data().get()}, - base_dst_id_(base_dst_id), - sentinel_value_(sentinel_value) - { - } - - src_cluster_equality_comparator_t(data_t const *d_src, - data_t const *d_dst, - data_t const *d_dst_cluster_cache, - data_t base_dst_id, - sentinel_t sentinel_value) - : d_src_{d_src}, - d_dst_{d_dst}, - d_dst_cluster_{d_dst_cluster_cache}, - base_dst_id_(base_dst_id), - sentinel_value_(sentinel_value) - { - } - - __device__ bool operator()(sentinel_t lhs_index, sentinel_t rhs_index) const noexcept - { - return (lhs_index != sentinel_value_) && (rhs_index != sentinel_value_) && - (d_src_[lhs_index] == d_src_[rhs_index]) && - (d_dst_cluster_[d_dst_[lhs_index] - base_dst_id_] == - d_dst_cluster_[d_dst_[rhs_index] - base_dst_id_]); - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - data_t const *d_dst_cluster_; - data_t base_dst_id_; - sentinel_t sentinel_value_; -}; - -// -// Hash edges based src[e] and cluster[dst[e]] -// -template -class src_cluster_hasher_t { - public: - src_cluster_hasher_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - rmm::device_vector const &dst_cluster_cache, - data_t base_dst_id) - : d_src_{src.data().get()}, - d_dst_{dst.data().get()}, - d_dst_cluster_{dst_cluster_cache.data().get()}, - base_dst_id_(base_dst_id) - { - } - - src_cluster_hasher_t(data_t const *d_src, - data_t const *d_dst, - data_t const *d_dst_cluster_cache, - data_t base_dst_id) - : d_src_{d_src}, d_dst_{d_dst}, d_dst_cluster_{d_dst_cluster_cache}, base_dst_id_(base_dst_id) - { - } - - template - __device__ auto operator()(idx_type index) const - { - cuco::detail::MurmurHash3_32 hasher; - - auto h_src = hasher(d_src_[index]); - auto h_cluster = hasher(d_dst_cluster_[d_dst_[index] - base_dst_id_]); - - /* - * Combine the source hash and the cluster hash into a single hash value - * - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - */ - h_src ^= h_cluster + 0x9e3779b9 + (h_src << 6) + (h_src >> 2); - - return h_src; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - data_t const *d_dst_cluster_; - data_t base_dst_id_; -}; - -// -// Skip edges where src[e] == dst[e] -// -template -class skip_edge_t { - public: - skip_edge_t(rmm::device_vector const &src, rmm::device_vector const &dst) - : d_src_{src.data().get()}, d_dst_{dst.data().get()} - { - } - - skip_edge_t(data_t const *src, data_t const *dst) : d_src_{src}, d_dst_{dst} {} - - template - __device__ auto operator()(idx_type index) const - { - return d_src_[index] == d_dst_[index]; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; -}; - -template -struct lookup_by_vertex_id { - public: - lookup_by_vertex_id(data_t const *d_array, vertex_t const *d_vertices, vertex_t base_vertex_id) - : d_array_(d_array), d_vertices_(d_vertices), base_vertex_id_(base_vertex_id) - { - } - - template - data_t operator() __device__(edge_t edge_id) const - { - return d_array_[d_vertices_[edge_id] - base_vertex_id_]; - } - - private: - data_t const *d_array_; - vertex_t const *d_vertices_; - vertex_t base_vertex_id_; -}; - -template -vector_t 
remove_elements_from_vector(vector_t const &input_v, - iterator_t iterator_begin, - iterator_t iterator_end, - function_t function, - cudaStream_t stream) -{ - vector_t temp_v(input_v.size()); - - auto last = thrust::copy_if( - rmm::exec_policy(stream)->on(stream), iterator_begin, iterator_end, temp_v.begin(), function); - - temp_v.resize(thrust::distance(temp_v.begin(), last)); - - return temp_v; -} - -template -vector_t remove_elements_from_vector(vector_t const &input_v, - function_t function, - cudaStream_t stream) -{ - return remove_elements_from_vector(input_v, input_v.begin(), input_v.end(), function, stream); -} - -// FIXME: This should be a generic utility. The one in cython.cu -// is very close to this -template * = nullptr> -std::unique_ptr> -create_graph(raft::handle_t const &handle, - rmm::device_vector const &src_v, - rmm::device_vector const &dst_v, - rmm::device_vector const &weight_v, - std::size_t num_local_verts, - experimental::graph_properties_t graph_props, - view_t const &view) -{ - std::vector> edgelist( - {{src_v.data().get(), - dst_v.data().get(), - weight_v.data().get(), - static_cast(src_v.size())}}); - - return std::make_unique>( - handle, - edgelist, - view.get_partition(), - num_local_verts, - src_v.size(), - graph_props, - false, - false); -} - -template * = nullptr> -std::unique_ptr> -create_graph(raft::handle_t const &handle, - rmm::device_vector const &src_v, - rmm::device_vector const &dst_v, - rmm::device_vector const &weight_v, - std::size_t num_local_verts, - experimental::graph_properties_t graph_props, - view_t const &view) -{ - experimental::edgelist_t edgelist{ - src_v.data().get(), - dst_v.data().get(), - weight_v.data().get(), - static_cast(src_v.size())}; - - return std::make_unique>( - handle, edgelist, num_local_verts, graph_props, false, false); -} - -} // namespace detail - -// -// FIXME: Ultimately, this would be cleaner and more efficient if we did the following: -// -// 1) Create an object that does a single level Louvain computation on an input graph -// (no graph contraction) -// 2) Create an object that does graph contraction -// 3) Create Louvain to use these objects in sequence to compute the aggregate result. -// -// In MNMG-world, the graph contraction step is going to create another graph that likely -// fits efficiently in a smaller number of GPUs (eventually one). Decomposing the algorithm -// as above would allow us to eventually run the single GPU version of single level Louvain -// on the contracted graphs - which should be more efficient. -// -// FIXME: We should return the dendrogram and let the python layer clean it up (or have a -// separate C++ function to flatten the dendrogram). 
There are customers that might -// like the dendrogram and the implementation would be a bit cleaner if we did the -// collapsing as a separate step -// template class Louvain { public: @@ -405,52 +68,25 @@ class Louvain { handle_(handle), dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - compute_partition_(graph_view), local_num_vertices_(graph_view.get_number_of_local_vertices()), - local_num_rows_(graph_view.get_number_of_local_adj_matrix_partition_rows()), - local_num_cols_(graph_view.get_number_of_local_adj_matrix_partition_cols()), - local_num_edges_(graph_view.get_number_of_edges()), - vertex_weights_v_(graph_view.get_number_of_local_vertices()), - cluster_weights_v_(graph_view.get_number_of_local_vertices()), - number_of_vertices_(graph_view.get_number_of_local_vertices()), + vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + src_vertex_weights_cache_v_(0, handle.get_stream()), + src_cluster_weights_cache_v_(0, handle.get_stream()), + dst_cluster_weights_cache_v_(0, handle.get_stream()), + src_cluster_cache_v_(0, handle.get_stream()), + dst_cluster_cache_v_(0, handle.get_stream()), stream_(handle.get_stream()) { if (graph_view_t::is_multi_gpu) { - rank_ = handle.get_comms().get_rank(); - base_vertex_id_ = graph_view.get_local_vertex_first(); - base_src_vertex_id_ = graph_view.get_local_adj_matrix_partition_row_first(0); - base_dst_vertex_id_ = graph_view.get_local_adj_matrix_partition_col_first(0); - - local_num_edges_ = thrust::transform_reduce( - thrust::host, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator( - graph_view.get_number_of_local_adj_matrix_partitions()), - [&graph_view](auto indx) { - return graph_view.get_number_of_local_adj_matrix_partition_edges(indx); - }, - size_t{0}, - thrust::plus()); - - CUDA_TRY(cudaStreamSynchronize(stream_)); - } - - src_indices_v_.resize(local_num_edges_); - - cugraph::detail::offsets_to_indices( - current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get()); - - if (base_src_vertex_id_ > 0) { - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - src_indices_v_.begin(), - src_indices_v_.end(), - thrust::make_constant_iterator(base_src_vertex_id_), - src_indices_v_.begin(), - thrust::plus()); + rank_ = handle.get_comms().get_rank(); + base_vertex_id_ = graph_view.get_local_vertex_first(); } } - Dendrogram &get_dendrogram() const { return *dendrogram_; } + Dendrogram const &get_dendrogram() const { return *dendrogram_; } + + Dendrogram &get_dendrogram() { return *dendrogram_; } std::unique_ptr> move_dendrogram() { return dendrogram_; } @@ -458,9 +94,7 @@ class Louvain { { weight_t best_modularity = weight_t{-1}; -#ifdef CUCO_STATIC_MAP_DEFINED - weight_t total_edge_weight; - total_edge_weight = experimental::transform_reduce_e( + weight_t total_edge_weight = experimental::transform_reduce_e( handle_, current_graph_view_, thrust::make_constant_iterator(0), @@ -486,7 +120,6 @@ class Louvain { } timer_display(std::cout); -#endif return best_modularity; } @@ -540,8 +173,8 @@ class Louvain { weight_t sum_internal = experimental::transform_reduce_e( handle_, current_graph_view_, - src_cluster_cache_v_.begin(), - dst_cluster_cache_v_.begin(), + d_src_cluster_cache_, + d_dst_cluster_cache_, [] __device__(auto src, auto dst, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { return wt; @@ -561,6 +194,10 @@ class Louvain { { 
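    // Each vertex starts in its own cluster, so the weighted out-degree k_v
    // computed here also serves as the initial cluster weight a_c:
    //   k_v = sum of w(v, nbr) over v's outgoing edges   -> vertex_weights_v_
    //   a_c = sum of k_v over the vertices currently in c -> cluster_weights_v_
    // Both arrays are then cached on the adjacency-matrix rows/columns (in the
    // multi-GPU case) so the edge-centric primitives can read them per edge.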
timer_start("compute_vertex_and_cluster_weights"); + // + // TODO: Once PR 1394 is merged, this can be replaced by: + // vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); + // experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, @@ -575,44 +212,67 @@ class Louvain { vertex_weights_v_.end(), cluster_weights_v_.begin()); - cache_vertex_properties( - vertex_weights_v_.begin(), src_vertex_weights_cache_v_, dst_vertex_weights_cache_v_); + d_src_vertex_weights_cache_ = + cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); - cache_vertex_properties( - cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); + std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( + cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); timer_stop(stream_); } - template - void cache_vertex_properties(iterator_t const &local_input_iterator, - rmm::device_vector &src_cache_v, - rmm::device_vector &dst_cache_v, - bool src = true, - bool dst = true) + template + T *cache_src_vertex_properties(rmm::device_uvector &input, + rmm::device_uvector &src_cache_v) { - if (src) { - src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows()); - copy_to_adj_matrix_row( - handle_, current_graph_view_, local_input_iterator, src_cache_v.begin()); + if (graph_view_t::is_multi_gpu) { + src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), + stream_); + copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); + return src_cache_v.begin(); + } else { + return input.begin(); } + } - if (dst) { - dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols()); - copy_to_adj_matrix_col( - handle_, current_graph_view_, local_input_iterator, dst_cache_v.begin()); + template + T *cache_dst_vertex_properties(rmm::device_uvector &input, + rmm::device_uvector &dst_cache_v) + { + if (graph_view_t::is_multi_gpu) { + dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), + stream_); + copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); + return dst_cache_v.begin(); + } else { + return input.begin(); } } -#ifdef CUCO_STATIC_MAP_DEFINED + template + std::tuple cache_vertex_properties(rmm::device_uvector &input, + rmm::device_uvector &src_cache_v, + rmm::device_uvector &dst_cache_v) + { + auto src = cache_src_vertex_properties(input, src_cache_v); + auto dst = cache_dst_vertex_properties(input, dst_cache_v); + + return std::make_tuple(src, dst); + } + virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); - rmm::device_vector next_cluster_v(dendrogram_->current_level_begin(), - dendrogram_->current_level_end()); + rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), stream_); - cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_); + raft::copy(next_cluster_v.begin(), + dendrogram_->current_level_begin(), + dendrogram_->current_level_size(), + stream_); + + std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = + cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -629,8 +289,6 @@ class Louvain { up_down = !up_down; - 
cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_); - new_Q = modularity(total_edge_weight, resolution); if (new_Q > cur_Q) { @@ -641,27 +299,26 @@ class Louvain { } } - // cache the final clustering locally on each cpu - cache_vertex_properties( - dendrogram_->current_level_begin(), src_cluster_cache_v_, dst_cluster_cache_v_); - timer_stop(stream_); return cur_Q; } void update_by_delta_modularity(weight_t total_edge_weight, weight_t resolution, - rmm::device_vector &next_cluster_v, + rmm::device_uvector &next_cluster_v, bool up_down) { - rmm::device_vector old_cluster_sum_v(local_num_vertices_); - rmm::device_vector src_old_cluster_sum_cache_v; + rmm::device_uvector old_cluster_sum_v(local_num_vertices_, stream_); + rmm::device_uvector cluster_subtract_v(local_num_vertices_, stream_); + + rmm::device_uvector tmp_cluster_keys_v(0, stream_); + rmm::device_uvector tmp_cluster_weights_v(0, stream_); experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, - src_cluster_cache_v_.begin(), - dst_cluster_cache_v_.begin(), + d_src_cluster_cache_, + d_dst_cluster_cache_, [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { if ((src != dst) && (src_cluster == nbr_cluster)) { return wt; @@ -671,549 +328,105 @@ class Louvain { weight_t{0}, old_cluster_sum_v.begin()); - cache_vertex_properties( - old_cluster_sum_v.begin(), src_old_cluster_sum_cache_v, empty_cache_weight_v_, true, false); - - detail::src_cluster_equality_comparator_t compare( - src_indices_v_.data().get(), - current_graph_view_.indices(), - dst_cluster_cache_v_.data().get(), - base_dst_vertex_id_, - std::numeric_limits::max()); - detail::src_cluster_hasher_t hasher(src_indices_v_.data().get(), - current_graph_view_.indices(), - dst_cluster_cache_v_.data().get(), - base_dst_vertex_id_); - detail::skip_edge_t skip_edge(src_indices_v_.data().get(), - current_graph_view_.indices()); - - // - // Group edges that lead from same source to same neighboring cluster together - // local_cluster_edge_ids_v will contain edge ids of unique pairs of (src,nbr_cluster). - // If multiple edges exist, one edge id will be chosen (by a parallel race). - // nbr_weights_v will contain the combined weight of all of the edges that connect - // that pair. - // - rmm::device_vector local_cluster_edge_ids_v; - rmm::device_vector nbr_weights_v; - - // - // Perform this combining on the local edges - // - std::tie(local_cluster_edge_ids_v, nbr_weights_v) = combine_local_src_nbr_cluster_weights( - hasher, compare, skip_edge, current_graph_view_.weights(), local_num_edges_); - - // - // In order to compute delta_Q for a given src/nbr_cluster pair, I need the following - // information: - // src - // old_cluster - the cluster that src is currently assigned to - // nbr_cluster - // sum of edges going to new cluster - // vertex weight of the src vertex - // sum of edges going to old cluster - // cluster_weights of old cluster - // cluster_weights of nbr_cluster - // - // Each GPU has locally cached: - // The sum of edges going to the old cluster (computed from - // experimental::copy_v_transform_reduce_out_nbr call above. 
- // old_cluster - // nbr_cluster - // vertex weight of src vertex - // partial sum of edges going to the new cluster (in nbr_weights) - // - // So the plan is to take the tuple: - // (src, old_cluster, src_vertex_weight, old_cluster_sum, nbr_cluster, nbr_weights) - // and shuffle it around the cluster so that they arrive at the GPU where the pair - // (old_cluster, new_cluster) would be assigned. Then we can aggregate this information - // and compute the delta_Q values. - // - - // - // Define the communication pattern, we're going to send detail - // for edge i to the GPU that is responsible for the vertex - // pair (cluster[src[i]], cluster[dst[i]]) - // - auto communication_schedule = thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - [d_edge_device_view = compute_partition_.edge_device_view(), - d_src_indices = src_indices_v_.data().get(), - d_src_cluster = src_cluster_cache_v_.data().get(), - d_dst_indices = current_graph_view_.indices(), - d_dst_cluster = dst_cluster_cache_v_.data().get(), - base_src_vertex_id = base_src_vertex_id_, - base_dst_vertex_id = base_dst_vertex_id_] __device__(edge_t edge_id) { - return d_edge_device_view(d_src_cluster[d_src_indices[edge_id] - base_src_vertex_id], - d_dst_cluster[d_dst_indices[edge_id] - base_dst_vertex_id]); - }); - - // FIXME: This should really be a variable_shuffle of a tuple, for time - // reasons I'm just doing 6 independent shuffles. - // - rmm::device_vector ocs_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id(src_old_cluster_sum_cache_v.data().get(), - src_indices_v_.data().get(), - base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_cluster_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id( - src_cluster_cache_v_.data().get(), src_indices_v_.data().get(), base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_vertex_weight_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id(src_vertex_weights_cache_v_.data().get(), - src_indices_v_.data().get(), - base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_v = variable_shuffle( + experimental::copy_v_transform_reduce_out_nbr( handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(src_indices_v_.begin(), local_cluster_edge_ids_v.begin()), - communication_schedule); - - rmm::device_vector nbr_cluster_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id( - dst_cluster_cache_v_.data().get(), current_graph_view_.indices(), base_dst_vertex_id_)), - communication_schedule); - - nbr_weights_v = variable_shuffle( - handle_, nbr_weights_v.size(), nbr_weights_v.begin(), communication_schedule); - - // - // At this point, src_v, nbr_cluster_v and nbr_weights_v have been - // shuffled to the correct GPU. We can now compute the final - // value of delta_Q for each neigboring cluster - // - // Again, we'll combine edges that connect the same source to the same - // neighboring cluster and sum their weights. 
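  //   (This hash-table / shuffle pipeline is what the refactor removes: the
  //   per-(src, neighbor-cluster) aggregation is instead performed by a single
  //   copy_v_transform_reduce_key_aggregated_out_nbr call in the new
  //   update_by_delta_modularity code further down in this diff.)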
- // - detail::src_dst_equality_comparator_t compare2( - src_v, nbr_cluster_v, std::numeric_limits::max()); - detail::src_dst_hasher_t hasher2(src_v, nbr_cluster_v); - - auto skip_edge2 = [] __device__(auto) { return false; }; - - std::tie(local_cluster_edge_ids_v, nbr_weights_v) = combine_local_src_nbr_cluster_weights( - hasher2, compare2, skip_edge2, nbr_weights_v.data().get(), src_v.size()); - - // - // Now local_cluster_edge_ids_v contains the edge ids of the src id/dest - // cluster id pairs, and nbr_weights_v contains the weight of edges - // going to that cluster id - // - // Now we can compute (locally) each delta_Q value - // - auto iter = thrust::make_zip_iterator( - thrust::make_tuple(local_cluster_edge_ids_v.begin(), nbr_weights_v.begin())); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - iter, - iter + local_cluster_edge_ids_v.size(), - nbr_weights_v.begin(), - [total_edge_weight, - resolution, - d_src = src_v.data().get(), - d_src_cluster = src_cluster_v.data().get(), - d_nbr_cluster = nbr_cluster_v.data().get(), - d_src_vertex_weights = src_vertex_weight_v.data().get(), - d_src_cluster_weights = src_cluster_weights_cache_v_.data().get(), - d_dst_cluster_weights = dst_cluster_weights_cache_v_.data().get(), - d_ocs = ocs_v.data().get(), - base_src_vertex_id = base_src_vertex_id_, - base_dst_vertex_id = base_dst_vertex_id_] __device__(auto tuple) { - edge_t edge_id = thrust::get<0>(tuple); - vertex_t nbr_cluster = d_nbr_cluster[edge_id]; - weight_t new_cluster_sum = thrust::get<1>(tuple); - vertex_t old_cluster = d_src_cluster[edge_id]; - weight_t k_k = d_src_vertex_weights[edge_id]; - weight_t old_cluster_sum = d_ocs[edge_id]; - - weight_t a_old = d_src_cluster_weights[old_cluster - base_src_vertex_id]; - weight_t a_new = d_dst_cluster_weights[nbr_cluster - base_dst_vertex_id]; - - return 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - - resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / - (total_edge_weight * total_edge_weight)); - }); - - // - // Pick the largest delta_Q value for each vertex on this gpu. 
- // Then we will shuffle back to the gpu by vertex id - // - rmm::device_vector final_src_v(local_cluster_edge_ids_v.size()); - rmm::device_vector final_nbr_cluster_v(local_cluster_edge_ids_v.size()); - rmm::device_vector final_nbr_weights_v(local_cluster_edge_ids_v.size()); - - auto final_input_iter = thrust::make_zip_iterator(thrust::make_tuple( - thrust::make_permutation_iterator(src_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_permutation_iterator(nbr_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - nbr_weights_v.begin())); - - auto final_output_iter = thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_cluster_v.begin(), final_nbr_weights_v.begin())); - - auto final_output_pos = - thrust::copy_if(rmm::exec_policy(stream_)->on(stream_), - final_input_iter, - final_input_iter + local_cluster_edge_ids_v.size(), - final_output_iter, - [] __device__(auto p) { return (thrust::get<2>(p) > weight_t{0}); }); - - final_src_v.resize(thrust::distance(final_output_iter, final_output_pos)); - final_nbr_cluster_v.resize(thrust::distance(final_output_iter, final_output_pos)); - final_nbr_weights_v.resize(thrust::distance(final_output_iter, final_output_pos)); - - // - // Sort the results, pick the largest version - // - thrust::sort(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_weights_v.begin(), final_nbr_cluster_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.end(), final_nbr_weights_v.end(), final_nbr_cluster_v.begin())), - [] __device__(auto left, auto right) { - if (thrust::get<0>(left) < thrust::get<0>(right)) return true; - if (thrust::get<0>(left) > thrust::get<0>(right)) return false; - if (thrust::get<1>(left) > thrust::get<1>(right)) return true; - if (thrust::get<1>(left) < thrust::get<1>(right)) return false; - return (thrust::get<2>(left) < thrust::get<2>(right)); - }); - - // - // Now that we're sorted the first entry for each src value is the largest. - // - local_cluster_edge_ids_v.resize(final_src_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_src_v.size()), - local_cluster_edge_ids_v.begin(), - [sentinel = std::numeric_limits::max(), - d_src = final_src_v.data().get()] __device__(edge_t edge_id) { - if (edge_id == 0) { return edge_id; } - - if (d_src[edge_id - 1] != d_src[edge_id]) { return edge_id; } - - return sentinel; - }); - - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - [sentinel = std::numeric_limits::max()] __device__(auto edge_id) { - return (edge_id != sentinel); + current_graph_view_, + d_src_cluster_cache_, + d_dst_cluster_cache_, + [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { + return (src == dst) ? 
wt : weight_t{0}; }, - stream_); - - final_nbr_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_nbr_weights_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_weights_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_src_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - // - // At this point... - // final_src_v contains the source indices - // final_nbr_cluster_v contains the neighboring clusters - // final_nbr_weights_v contains delta_Q for moving src to the neighboring - // - // They have been shuffled to the gpus responsible for their source vertex - // - // FIXME: Think about how this should work. - // I think Leiden is broken. I don't think that the code we have - // actually does anything. For now I'm going to ignore Leiden in - // MNMG, we can reconsider this later. - // - // If we ignore Leiden, I'd like to think about whether the reduction - // should occur now... - // - - // - // Sort the results, pick the largest version - // - thrust::sort(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_weights_v.begin(), final_nbr_cluster_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.end(), final_nbr_weights_v.end(), final_nbr_cluster_v.begin())), - [] __device__(auto left, auto right) { - if (thrust::get<0>(left) < thrust::get<0>(right)) return true; - if (thrust::get<0>(left) > thrust::get<0>(right)) return false; - if (thrust::get<1>(left) > thrust::get<1>(right)) return true; - if (thrust::get<1>(left) < thrust::get<1>(right)) return false; - return (thrust::get<2>(left) < thrust::get<2>(right)); - }); - - // - // Now that we're sorted (ascending), the last entry for each src value is the largest. 
- // - local_cluster_edge_ids_v.resize(final_src_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_src_v.size()), - local_cluster_edge_ids_v.begin(), - [sentinel = std::numeric_limits::max(), - d_src = final_src_v.data().get()] __device__(edge_t edge_id) { - if (edge_id == 0) { return edge_id; } - - if (d_src[edge_id - 1] != d_src[edge_id]) { return edge_id; } + weight_t{0}, + cluster_subtract_v.begin()); - return sentinel; - }); + auto output_buffer = + cugraph::experimental::allocate_dataframe_buffer>( + local_num_vertices_, stream_); - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - [sentinel = std::numeric_limits::max()] __device__(auto edge_id) { - return (edge_id != sentinel); + copy_v_transform_reduce_key_aggregated_out_nbr( + handle_, + current_graph_view_, + thrust::make_zip_iterator(thrust::make_tuple(old_cluster_sum_v.begin(), + d_src_vertex_weights_cache_, + cluster_subtract_v.begin(), + d_src_cluster_cache_)), + + d_dst_cluster_cache_, + thrust::make_counting_iterator(base_vertex_id_), + thrust::make_counting_iterator(base_vertex_id_ + local_num_vertices_), + d_dst_cluster_weights_cache_, + [base_vertex_id = base_vertex_id_, + d_src_cluster_weights = d_src_cluster_weights_cache_, + total_edge_weight, + resolution] __device__(auto src, + auto neighbor_cluster, + auto new_cluster_sum, + auto src_info, + auto a_new) { + auto old_cluster_sum = thrust::get<0>(src_info); + auto k_k = thrust::get<1>(src_info); + auto cluster_subtract = thrust::get<2>(src_info); + auto src_cluster = thrust::get<3>(src_info); + auto a_old = d_src_cluster_weights[src_cluster]; + + if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; + + weight_t delta_modularity = 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - + resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / + (total_edge_weight * total_edge_weight)); + + return thrust::make_tuple(neighbor_cluster, delta_modularity); }, - stream_); - - rmm::device_vector cluster_increase_v(final_src_v.size()); - rmm::device_vector cluster_decrease_v(final_src_v.size()); - rmm::device_vector old_cluster_v(final_src_v.size()); - - // - // Then we can, on each gpu, do a local assignment for all of the - // vertices assigned to that gpu using the up_down logic - // - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - local_cluster_edge_ids_v.begin(), - local_cluster_edge_ids_v.end(), - [d_final_src = final_src_v.data().get(), - d_final_nbr_cluster = final_nbr_cluster_v.data().get(), - d_final_nbr_weights = final_nbr_weights_v.data().get(), - d_cluster_increase = cluster_increase_v.data().get(), - d_cluster_decrease = cluster_decrease_v.data().get(), - d_vertex_weights = src_vertex_weights_cache_v_.data().get(), - d_next_cluster = next_cluster_v.data().get(), - d_old_cluster = old_cluster_v.data().get(), - base_vertex_id = base_vertex_id_, - base_src_vertex_id = base_src_vertex_id_, - up_down] __device__(edge_t idx) { - vertex_t src = d_final_src[idx]; - vertex_t new_cluster = d_final_nbr_cluster[idx]; - vertex_t old_cluster = d_next_cluster[src - base_vertex_id]; - weight_t src_weight = d_vertex_weights[src - base_src_vertex_id]; - - if (d_final_nbr_weights[idx] <= weight_t{0}) return false; - if (new_cluster == old_cluster) return false; - if ((new_cluster > old_cluster) != up_down) return false; - - d_next_cluster[src - base_vertex_id] = 
new_cluster; - d_cluster_increase[idx] = src_weight; - d_cluster_decrease[idx] = src_weight; - d_old_cluster[idx] = old_cluster; - return true; + [] __device__(auto p1, auto p2) { + return (thrust::get<1>(p1) < thrust::get<1>(p2)) ? p2 : p1; }, - stream_); + thrust::make_tuple(vertex_t{-1}, weight_t{0}), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer)); - cluster_increase_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(cluster_increase_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_nbr_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - cluster_decrease_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(cluster_decrease_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - old_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(final_nbr_cluster_v.begin(), cluster_increase_v.begin())), - thrust::make_zip_iterator( - thrust::make_tuple(final_nbr_cluster_v.end(), cluster_increase_v.end())), - [d_cluster_weights = cluster_weights_v_.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto p) { - vertex_t cluster_id = thrust::get<0>(p); - weight_t weight = thrust::get<1>(p); - - atomicAdd(d_cluster_weights + cluster_id - base_vertex_id, weight); - }); - - thrust::for_each( + thrust::transform( rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(old_cluster_v.begin(), cluster_decrease_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple(old_cluster_v.end(), cluster_decrease_v.end())), - [d_cluster_weights = cluster_weights_v_.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto p) { - vertex_t cluster_id = thrust::get<0>(p); - weight_t weight = thrust::get<1>(p); - - atomicAdd(d_cluster_weights + cluster_id - base_vertex_id, -weight); + next_cluster_v.begin(), + next_cluster_v.end(), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + next_cluster_v.begin(), + [up_down] __device__(vertex_t old_cluster, auto p) { + vertex_t new_cluster = thrust::get<0>(p); + weight_t 
delta_modularity = thrust::get<1>(p); + + return (delta_modularity > weight_t{0}) + ? (((new_cluster > old_cluster) != up_down) ? old_cluster : new_cluster) + : old_cluster; }); - cache_vertex_properties( - cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); - } - - template - std::pair, rmm::device_vector> - combine_local_src_nbr_cluster_weights(hash_t hasher, - compare_t compare, - skip_edge_t skip_edge, - weight_t const *d_weights, - count_t num_weights) - { - rmm::device_vector relevant_edges_v; - rmm::device_vector relevant_edge_weights_v; - - if (num_weights > 0) { - std::size_t capacity{static_cast(num_weights / 0.7)}; - - cuco::static_map hash_map( - capacity, std::numeric_limits::max(), count_t{0}); - detail::create_cuco_pair_t create_cuco_pair; - - CUDA_TRY(cudaStreamSynchronize(stream_)); - - hash_map.insert(thrust::make_transform_iterator(thrust::make_counting_iterator(0), - create_cuco_pair), - thrust::make_transform_iterator( - thrust::make_counting_iterator(num_weights), create_cuco_pair), - hasher, - compare); - - CUDA_TRY(cudaStreamSynchronize(stream_)); - - relevant_edges_v.resize(num_weights); - - relevant_edges_v = detail::remove_elements_from_vector( - relevant_edges_v, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_weights), - [d_hash_map = hash_map.get_device_view(), hasher, compare] __device__(count_t idx) { - auto pos = d_hash_map.find(idx, hasher, compare); - return (pos->first == idx); - }, - stream_); - - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - relevant_edges_v.size(), - [d_hash_map = hash_map.get_device_view(), - hasher, - compare, - d_relevant_edges = relevant_edges_v.data().get()] __device__(count_t idx) mutable { - count_t edge_id = d_relevant_edges[idx]; - auto pos = d_hash_map.find(edge_id, hasher, compare); - pos->second.store(idx); - }); - - relevant_edge_weights_v.resize(relevant_edges_v.size()); - thrust::fill(rmm::exec_policy(stream_)->on(stream_), - relevant_edge_weights_v.begin(), - relevant_edge_weights_v.end(), - weight_t{0}); - - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - num_weights, - [d_hash_map = hash_map.get_device_view(), - hasher, - compare, - skip_edge, - d_relevant_edge_weights = relevant_edge_weights_v.data().get(), - d_weights] __device__(count_t idx) { - if (!skip_edge(idx)) { - auto pos = d_hash_map.find(idx, hasher, compare); - if (pos != d_hash_map.end()) { - atomicAdd(d_relevant_edge_weights + pos->second.load(cuda::std::memory_order_relaxed), - d_weights[idx]); - } - } - }); - } + std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = + cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); - return std::make_pair(relevant_edges_v, relevant_edge_weights_v); + std::tie(tmp_cluster_keys_v, tmp_cluster_weights_v) = + cugraph::experimental::transform_reduce_by_adj_matrix_row_key_e( + handle_, + current_graph_view_, + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0), + d_src_cluster_cache_, + [] __device__(auto src, auto, auto wt, auto, auto) { return wt; }, + weight_t{0}); + + thrust::fill(rmm::exec_policy(stream_)->on(stream_), + cluster_weights_v_.begin(), + cluster_weights_v_.end(), + weight_t{0}); + + thrust::scatter(rmm::exec_policy(stream_)->on(stream_), + tmp_cluster_weights_v.begin(), + tmp_cluster_weights_v.end(), + tmp_cluster_keys_v.begin(), + cluster_weights_v_.begin()); + + 
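    // The score maximized above is
    //   delta_Q = 2 * ( (sum_new - sum_old) / W
    //                   - resolution * k_v * (a_new - a_old + k_v) / (W * W) ),
    // with W = total_edge_weight, k_v the vertex's weighted degree, and
    // a_old / a_new the weights of the current and candidate clusters.
    // A vertex moves only when its best delta_Q is positive, and the
    // (new_cluster > old_cluster) != up_down test restricts each pass to moves
    // in one direction, which helps keep adjacent vertices from simply swapping
    // clusters on successive passes.  The transform_reduce_by_adj_matrix_row_key_e
    // / scatter sequence above then rebuilds cluster_weights_v_ (the a_c terms)
    // from the new assignment before the cluster-weight caches are refreshed below.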
std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( + cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); } -#endif void shrink_graph() { @@ -1227,26 +440,8 @@ class Louvain { current_graph_view_ = current_graph_->view(); local_num_vertices_ = current_graph_view_.get_number_of_local_vertices(); - local_num_rows_ = current_graph_view_.get_number_of_local_adj_matrix_partition_rows(); - local_num_cols_ = current_graph_view_.get_number_of_local_adj_matrix_partition_cols(); base_vertex_id_ = current_graph_view_.get_local_vertex_first(); - local_num_edges_ = thrust::transform_reduce( - thrust::host, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator( - current_graph_view_.get_number_of_local_adj_matrix_partitions()), - [this](auto indx) { - return current_graph_view_.get_number_of_local_adj_matrix_partition_edges(indx); - }, - size_t{0}, - thrust::plus()); - - src_indices_v_.resize(local_num_edges_); - - cugraph::detail::offsets_to_indices( - current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get()); - rmm::device_uvector numbering_indices(numbering_map.size(), stream_); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), numbering_indices.begin(), @@ -1270,45 +465,29 @@ class Louvain { std::unique_ptr> dendrogram_; - vertex_t number_of_vertices_; + vertex_t local_num_vertices_; vertex_t base_vertex_id_{0}; - vertex_t base_src_vertex_id_{0}; - vertex_t base_dst_vertex_id_{0}; int rank_{0}; - vertex_t local_num_vertices_; - vertex_t local_num_rows_; - vertex_t local_num_cols_; - edge_t local_num_edges_; - // // Copy of graph // std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; - // - // For partitioning - // - detail::compute_partition_t compute_partition_; - - rmm::device_vector src_indices_v_; - - // - // Weights and clustering across iterations of algorithm - // - rmm::device_vector vertex_weights_v_; - rmm::device_vector src_vertex_weights_cache_v_{}; - rmm::device_vector dst_vertex_weights_cache_v_{}; - - rmm::device_vector cluster_weights_v_; - rmm::device_vector src_cluster_weights_cache_v_{}; - rmm::device_vector dst_cluster_weights_cache_v_{}; - - rmm::device_vector src_cluster_cache_v_{}; - rmm::device_vector dst_cluster_cache_v_{}; - - rmm::device_vector empty_cache_weight_v_{}; + rmm::device_uvector vertex_weights_v_; + rmm::device_uvector cluster_weights_v_; + rmm::device_uvector src_vertex_weights_cache_v_; + rmm::device_uvector src_cluster_weights_cache_v_; + rmm::device_uvector dst_cluster_weights_cache_v_; + rmm::device_uvector src_cluster_cache_v_; + rmm::device_uvector dst_cluster_cache_v_; + + weight_t *d_src_vertex_weights_cache_; + weight_t *d_src_cluster_weights_cache_; + weight_t *d_dst_cluster_weights_cache_; + vertex_t *d_src_cluster_cache_; + vertex_t *d_dst_cluster_cache_; #ifdef TIMING HighResTimer hr_timer_; From e1581d7a56f0319738c9037dedfa613299c4ffd3 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 22 Feb 2021 20:08:40 -0500 Subject: [PATCH 02/18] clean up include files, delete obsolete header --- cpp/src/experimental/louvain.cuh | 27 ++-- cpp/src/experimental/shuffle.cuh | 226 ------------------------------- 2 files changed, 8 insertions(+), 245 deletions(-) delete mode 100644 cpp/src/experimental/shuffle.cuh diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 3597923d6b9..5e70d32ac94 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -15,18 +15,11 @@ 
*/ #pragma once -#include +#include #include - -#include -#include -#include -#include - -#include - #include + #include #include #include @@ -34,10 +27,6 @@ #include #include -#include - -#include - //#define TIMING #ifdef TIMING @@ -222,8 +211,7 @@ class Louvain { } template - T *cache_src_vertex_properties(rmm::device_uvector &input, - rmm::device_uvector &src_cache_v) + T *cache_src_vertex_properties(rmm::device_uvector &input, rmm::device_uvector &src_cache_v) { if (graph_view_t::is_multi_gpu) { src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), @@ -236,8 +224,7 @@ class Louvain { } template - T *cache_dst_vertex_properties(rmm::device_uvector &input, - rmm::device_uvector &dst_cache_v) + T *cache_dst_vertex_properties(rmm::device_uvector &input, rmm::device_uvector &dst_cache_v) { if (graph_view_t::is_multi_gpu) { dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), @@ -470,7 +457,9 @@ class Louvain { int rank_{0}; // - // Copy of graph + // Initially we run on the input graph view, + // but as we shrink the graph we'll keep the + // current graph here // std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; @@ -492,7 +481,7 @@ class Louvain { #ifdef TIMING HighResTimer hr_timer_; #endif -}; // namespace experimental +}; } // namespace experimental } // namespace cugraph diff --git a/cpp/src/experimental/shuffle.cuh b/cpp/src/experimental/shuffle.cuh deleted file mode 100644 index 40f3b510b10..00000000000 --- a/cpp/src/experimental/shuffle.cuh +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace cugraph { -namespace experimental { - -namespace detail { - -// -// FIXME: This implementation of variable_shuffle stages the data for transfer -// in host memory. It would be more efficient, I believe, to stage the -// data in device memory, but it would require actually instantiating -// the data in device memory which is already precious in the Louvain -// implementation. We should explore if it's actually more efficient -// through device memory and whether the improvement is worth the extra -// memory required. 
-// -template -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - // - // We need to compute the size of data movement - // - raft::comms::comms_t const &comms = handle.get_comms(); - - cudaStream_t stream = handle.get_stream(); - int num_gpus = comms.get_size(); - int my_gpu = comms.get_rank(); - - rmm::device_vector local_sizes_v(num_gpus, size_t{0}); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - partition_iter, - partition_iter + n_elements, - [num_gpus, d_local_sizes = local_sizes_v.data().get()] __device__(auto p) { - atomicAdd(d_local_sizes + p, size_t{1}); - }); - - std::vector h_local_sizes_v(num_gpus); - std::vector h_global_sizes_v(num_gpus); - std::vector h_input_v(n_elements); - std::vector h_partitions_v(n_elements); - - thrust::copy(local_sizes_v.begin(), local_sizes_v.end(), h_local_sizes_v.begin()); - thrust::copy(partition_iter, partition_iter + n_elements, h_partitions_v.begin()); - - std::vector requests(2 * num_gpus); - - int request_pos = 0; - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - if (gpu != my_gpu) { - comms.irecv(&h_global_sizes_v[gpu], 1, gpu, 0, &requests[request_pos]); - ++request_pos; - comms.isend(&h_local_sizes_v[gpu], 1, gpu, 0, &requests[request_pos]); - ++request_pos; - } else { - h_global_sizes_v[gpu] = h_local_sizes_v[gpu]; - } - } - - if (request_pos > 0) { comms.waitall(request_pos, requests.data()); } - - comms.barrier(); - - // - // Now global_sizes contains all of the counts, we need to - // allocate an array of the appropriate size - // - int64_t receive_size = - thrust::reduce(thrust::host, h_global_sizes_v.begin(), h_global_sizes_v.end()); - - std::vector temp_data; - - if (receive_size > 0) temp_data.resize(receive_size); - - rmm::device_vector input_v(n_elements); - - auto input_start = input_v.begin(); - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - input_start = thrust::copy_if(rmm::exec_policy(stream)->on(stream), - data_iter, - data_iter + n_elements, - partition_iter, - input_start, - [gpu] __device__(int32_t p) { return p == gpu; }); - } - - thrust::copy(input_v.begin(), input_v.end(), h_input_v.begin()); - - std::vector temp_v(num_gpus + 1); - - thrust::exclusive_scan( - thrust::host, h_global_sizes_v.begin(), h_global_sizes_v.end(), temp_v.begin()); - - temp_v[num_gpus] = temp_v[num_gpus - 1] + h_global_sizes_v[num_gpus - 1]; - h_global_sizes_v = temp_v; - - thrust::exclusive_scan( - thrust::host, h_local_sizes_v.begin(), h_local_sizes_v.end(), temp_v.begin()); - - temp_v[num_gpus] = temp_v[num_gpus - 1] + h_local_sizes_v[num_gpus - 1]; - h_local_sizes_v = temp_v; - - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - comms.barrier(); - - request_pos = 0; - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - size_t to_receive = h_global_sizes_v[gpu + 1] - h_global_sizes_v[gpu]; - size_t to_send = h_local_sizes_v[gpu + 1] - h_local_sizes_v[gpu]; - - if (gpu != my_gpu) { - if (to_receive > 0) { - comms.irecv( - temp_data.data() + h_global_sizes_v[gpu], to_receive, gpu, 0, &requests[request_pos]); - ++request_pos; - } - - if (to_send > 0) { - comms.isend( - h_input_v.data() + h_local_sizes_v[gpu], to_send, gpu, 0, &requests[request_pos]); - ++request_pos; - } - } else if (to_receive > 0) { - std::copy(h_input_v.begin() + h_local_sizes_v[gpu], - h_input_v.begin() + h_local_sizes_v[gpu + 1], - temp_data.begin() + h_global_sizes_v[gpu]); - } - } - - comms.barrier(); - - if (request_pos > 0) { 
comms.waitall(request_pos, requests.data()); } - - comms.barrier(); - - return rmm::device_vector(temp_data); -} - -} // namespace detail - -/** - * @brief shuffle data to the desired partition - * - * MNMG algorithms require shuffling data between partitions - * to get the data to the right location for computation. - * - * This function operates dynamically, there is no - * a priori knowledge about where the data will need - * to be transferred. - * - * This function will be executed on each GPU. Each gpu - * has a portion of the data (specified by begin_data and - * end_data iterators) and an iterator that identifies - * (for each corresponding element) which GPU the data - * should be shuffled to. - * - * The return value will be a device vector containing - * the data received by this GPU. - * - * Note that this function accepts iterators as input. - * `partition_iterator` will be traversed multiple times. - * - * @tparam is_multi_gpu If true, multi-gpu - shuffle will occur - * If false, single GPU - simple copy will occur - * @tparam data_t Type of the data being shuffled - * @tparam iterator_t Iterator referencing data to be shuffled - * @tparam partition_iter_t Iterator identifying the destination partition - * - * @param handle Library handle (RAFT) - * @param n_elements Number of elements to transfer - * @param data_iter Iterator that returns the elements to be transfered - * @param partition_iter Iterator that returns the partition where elements - * should be transfered. - */ -template * = nullptr> -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - return detail::variable_shuffle(handle, n_elements, data_iter, partition_iter); -} - -template * = nullptr> -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - return rmm::device_vector(data_iter, data_iter + n_elements); -} - -} // namespace experimental -} // namespace cugraph From 11aeffa34b2760b4837fcec5da851502ff7782f7 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Feb 2021 09:44:18 -0500 Subject: [PATCH 03/18] fix weight use in copy_v_transform_reduce_key_aggregated_out_nbr --- ...opy_v_transform_reduce_key_aggregated_out_nbr.cuh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 8490df1d17d..35938bad7cb 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -88,16 +88,17 @@ __global__ void for_all_major_for_all_nbr_low_degree( // in-place reduce_by_key vertex_t key_idx{0}; key_aggregated_edge_weights[local_offset + key_idx] = - weights != nullptr ? weights[0] : weight_t{1.0}; + weights != nullptr ? key_aggregated_edge_weights[local_offset] : weight_t{1.0}; + for (edge_t i = 1; i < local_degree; ++i) { if (minor_keys[local_offset + i] == minor_keys[local_offset + key_idx]) { key_aggregated_edge_weights[local_offset + key_idx] += - weights != nullptr ? weights[i] : weight_t{1.0}; + weights != nullptr ? 
key_aggregated_edge_weights[local_offset + i] : weight_t{1.0}; } else { ++key_idx; minor_keys[local_offset + key_idx] = minor_keys[local_offset + i]; key_aggregated_edge_weights[local_offset + key_idx] = - weights != nullptr ? weights[i] : weight_t{1.0}; + weights != nullptr ? key_aggregated_edge_weights[local_offset + i] : weight_t{1.0}; } } thrust::fill(thrust::seq, @@ -170,6 +171,7 @@ __global__ void for_all_major_for_all_nbr_low_degree( template Date: Fri, 26 Feb 2021 18:53:09 -0500 Subject: [PATCH 04/18] use compute_out_weight_sums, add MG unit test --- cpp/src/experimental/louvain.cuh | 18 +- cpp/tests/CMakeLists.txt | 12 ++ cpp/tests/community/louvain_mg_test.cpp | 213 +++++++++++++++++++++++ cpp/tests/utilities/mg_test_utilities.cu | 7 + 4 files changed, 237 insertions(+), 13 deletions(-) create mode 100644 cpp/tests/community/louvain_mg_test.cpp diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 5e70d32ac94..b0df1f2a757 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -183,18 +183,7 @@ class Louvain { { timer_start("compute_vertex_and_cluster_weights"); - // - // TODO: Once PR 1394 is merged, this can be replaced by: - // vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); - // - experimental::copy_v_transform_reduce_out_nbr( - handle_, - current_graph_view_, - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0), - [] __device__(auto src, auto, auto wt, auto, auto) { return wt; }, - weight_t{0}, - vertex_weights_v_.begin()); + vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); thrust::copy(rmm::exec_policy(stream_)->on(stream_), vertex_weights_v_.begin(), @@ -301,6 +290,7 @@ class Louvain { rmm::device_uvector tmp_cluster_keys_v(0, stream_); rmm::device_uvector tmp_cluster_weights_v(0, stream_); + // TODO: These two calls could be combined and return a pair, I would think that would be faster experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, @@ -408,7 +398,9 @@ class Louvain { thrust::scatter(rmm::exec_policy(stream_)->on(stream_), tmp_cluster_weights_v.begin(), tmp_cluster_weights_v.end(), - tmp_cluster_keys_v.begin(), + thrust::make_transform_iterator(tmp_cluster_keys_v.begin(), + [base_vertex_id = base_vertex_id_] __device__( + auto key) { return key - base_vertex_id; }), cluster_weights_v_.begin()); std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 68b277871b1..daf43a9f87a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -438,6 +438,18 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTest(MG_PAGERANK_TEST "${MG_PAGERANK_TEST_SRCS}") target_link_libraries(MG_PAGERANK_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + ########################################################################################### + # - MG LOUVAIN tests --------------------------------------------------------------------- + + set(MG_LOUVAIN_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/mg_test_utilities.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_mg_test.cpp") + + ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") + target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + else(MPI_CXX_FOUND) message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") 
endif(MPI_CXX_FOUND) diff --git a/cpp/tests/community/louvain_mg_test.cpp b/cpp/tests/community/louvain_mg_test.cpp new file mode 100644 index 00000000000..06715d52bc0 --- /dev/null +++ b/cpp/tests/community/louvain_mg_test.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +//////////////////////////////////////////////////////////////////////////////// +// Test param object. This defines the input and expected output for a test, and +// will be instantiated as the parameter to the tests defined below using +// INSTANTIATE_TEST_CASE_P() +// +struct Louvain_Testparams { + std::string graph_file_full_path{}; + bool weighted{false}; + size_t max_level; + double resolution; + + // TODO: We really should have a Graph_Testparms_Base class or something + // like that which can handle this graph_full_path thing. + // + Louvain_Testparams(std::string const& graph_file_path, + bool weighted, + size_t max_level, + double resolution) + : weighted(weighted), max_level(max_level), resolution(resolution) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Parameterized test fixture, to be used with TEST_P(). This defines common +// setup and teardown steps as well as common utilities used by each E2E MG +// test. In this case, each test is identical except for the inputs and +// expected outputs, so the entire test is defined in the run_test() method. +// +class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, + public ::testing::WithParamInterface { + public: + // Run once for each test instance + virtual void SetUp() {} + virtual void TearDown() {} + + // Return the results of running louvain on a single GPU for the dataset in + // graph_file_path. 
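+ //
+ // A minimal usage sketch (illustrative only; the concrete types are an
+ // assumption that mirrors the CheckInt32Int32FloatFloat instantiation below,
+ // and the arguments come from the test's Louvain_Testparams):
+ //
+ //   std::tie(sg_level, sg_modularity, sg_clustering) =
+ //     get_sg_results<int32_t, int32_t, float>(
+ //       handle, param.graph_file_full_path, param.max_level, param.resolution);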
+ template + std::tuple> get_sg_results( + raft::handle_t const& handle, + std::string const& graph_file_path, + size_t max_level, + weight_t resolution) + { + // TODO: Put this in the Graph test base class + // (make the call simpler here) + auto graph = + cugraph::test::read_graph_from_matrix_market_file( + handle, graph_file_path, true); // FIXME: should use param.test_weighted instead of true + + auto graph_view = graph.view(); + cudaStream_t stream = handle.get_stream(); + + rmm::device_uvector clustering_v(graph_view.get_number_of_local_vertices(), stream); + + size_t level; + weight_t modularity; + + std::tie(level, modularity) = + cugraph::louvain(handle, graph_view, clustering_v.data(), max_level, resolution); + + std::vector clustering(graph_view.get_number_of_local_vertices()); + raft::update_host(clustering.data(), clustering_v.data(), clustering_v.size(), stream); + + return std::make_tuple(level, modularity, clustering); + } + + // Compare the results of running louvain on multiple GPUs to that of a + // single-GPU run for the configuration in param. + template + void run_test(const Louvain_Testparams& param) + { + raft::handle_t handle; + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + const auto& comm = handle.get_comms(); + + cudaStream_t stream = handle.get_stream(); + + // Assuming 2 GPUs which means 1 row, 2 cols. 2 cols = row_comm_size of 2. + // FIXME: DO NOT ASSUME 2 GPUs, add code to compute prows, pcols + size_t row_comm_size{2}; + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + int my_rank = comm.get_rank(); + + // FIXME: graph must be weighted! + std::unique_ptr> // store_transposed=false, + // multi_gpu=true + mg_graph_ptr{}; + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + + std::tie(mg_graph_ptr, d_renumber_map_labels) = cugraph::test:: + create_graph_for_gpu // store_transposed=true + (handle, param.graph_file_full_path); + + auto mg_graph_view = mg_graph_ptr->view(); + + rmm::device_uvector clustering_v(mg_graph_view.get_number_of_local_vertices(), + stream); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + int level; + weight_t modularity; + + std::cout << "calling MG louvain" << std::endl; + + std::tie(level, modularity) = cugraph::louvain( + handle, mg_graph_view, clustering_v.data(), param.max_level, param.resolution); + + std::vector clustering(mg_graph_view.get_number_of_local_vertices()); + + raft::update_host(clustering.data(), clustering_v.data(), clustering_v.size(), stream); + + std::vector h_renumber_map_labels(mg_graph_view.get_number_of_vertices()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + stream); + + // Compare MG to SG + + // Each GPU will have a subset of the clustering + int sg_level; + weight_t sg_modularity; + std::vector sg_clustering; + + std::tie(sg_level, sg_modularity, sg_clustering) = get_sg_results( + handle, param.graph_file_full_path, param.max_level, param.resolution); + + std::cout << "MG: level = " << level << ", modularity = " << modularity << std::endl; + raft::print_host_vector("clustering", clustering.data(), clustering.size(), std::cout); + + std::cout << "SG: level = " << sg_level << ", modularity = " << sg_modularity << std::endl; + raft::print_host_vector("clustering", sg_clustering.data(), sg_clustering.size(), std::cout); + +#if 0 + // For this test, each GPU will have the full set of vertices and + // therefore the pageranks vectors 
should be equal in size. + ASSERT_EQ(h_sg_pageranks.size(), h_mg_pageranks.size()); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + vertex_t mapped_vertex{0}; + for (vertex_t i = 0; + i + mg_graph_view.get_local_vertex_first() < mg_graph_view.get_local_vertex_last(); + ++i) { + mapped_vertex = h_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex])) + << "MG PageRank value for vertex: " << i << " in rank: " << my_rank + << " has value: " << h_mg_pageranks[i] + << " which exceeds the error margin for comparing to SG value: " << h_sg_pageranks[i]; + } +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +TEST_P(Louvain_MG_Testfixture, CheckInt32Int32FloatFloat) +{ + run_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + e2e, + Louvain_MG_Testfixture, + ::testing::Values(Louvain_Testparams("test/datasets/karate.mtx", true, 100, 1) + // Louvain_Testparams("test/datasets/webbase-1M.mtx", true, 100, 1), + )); + +// FIXME: Enable proper RMM configuration by using CUGRAPH_TEST_PROGRAM_MAIN(). +// Currently seeing a RMM failure during init, need to investigate. +// CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/mg_test_utilities.cu b/cpp/tests/utilities/mg_test_utilities.cu index 26f2450b589..f3fe1480527 100644 --- a/cpp/tests/utilities/mg_test_utilities.cu +++ b/cpp/tests/utilities/mg_test_utilities.cu @@ -176,5 +176,12 @@ template std::tuple< rmm::device_uvector> create_graph_for_gpu(raft::handle_t& handle, const std::string& graph_file_path); +template std::tuple< + std::unique_ptr< + cugraph::experimental::graph_t>, // store_transposed=true + // multi_gpu=true + rmm::device_uvector> +create_graph_for_gpu(raft::handle_t& handle, const std::string& graph_file_path); + } // namespace test } // namespace cugraph From 9e4421e743f7ddbbf64a1eefcfa46e3ef4846c53 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 15 Mar 2021 19:13:24 -0400 Subject: [PATCH 05/18] MNMG Louvain working correctly --- cpp/include/utilities/device_comm.cuh | 12 +- cpp/src/community/dendrogram.cuh | 7 + cpp/src/community/ecg.cu | 2 +- cpp/src/community/flatten_dendrogram.cuh | 37 ++- cpp/src/community/leiden.cuh | 2 +- cpp/src/community/louvain.cuh | 2 +- cpp/src/experimental/louvain.cuh | 289 ++++++++++++------ cpp/tests/CMakeLists.txt | 3 - cpp/tests/community/louvain_mg_test.cpp | 148 ++++----- .../utilities/generate_graph_from_edgelist.cu | 210 +++++++++---- 10 files changed, 450 insertions(+), 262 deletions(-) diff --git a/cpp/include/utilities/device_comm.cuh b/cpp/include/utilities/device_comm.cuh index 7b9956902cc..e0bbea76068 100644 --- a/cpp/include/utilities/device_comm.cuh +++ b/cpp/include/utilities/device_comm.cuh @@ -238,10 +238,12 @@ template struct device_sendrecv_tuple_iterator_element_impl { void run(raft::comms::comms_t const& comm, InputIterator input_first, - size_t count, + size_t tx_count, int dst, - int base_tag, - raft::comms::request_t* requests) const + OutputIterator output_first, + size_t rx_count, + int src, + cudaStream_t stream) const { } }; @@ -460,7 +462,7 @@ struct device_reduce_tuple_iterator_element_impl { op, root, stream); - 
device_reduce_tuple_iterator_element_impl( + device_reduce_tuple_iterator_element_impl().run( comm, input_first, output_first, count, op, root, stream); } }; @@ -890,7 +892,7 @@ device_reduce(raft::comms::comms_t const& comm, thrust::tuple_size::value_type>::value; detail:: - device_reduce_tuple_iterator_element_impl( + device_reduce_tuple_iterator_element_impl().run( comm, input_first, output_first, count, op, root, stream); } diff --git a/cpp/src/community/dendrogram.cuh b/cpp/src/community/dendrogram.cuh index 414f5f3854d..eed34064d9a 100644 --- a/cpp/src/community/dendrogram.cuh +++ b/cpp/src/community/dendrogram.cuh @@ -26,12 +26,14 @@ template class Dendrogram { public: void add_level(vertex_t num_verts, + vertex_t first_index, cudaStream_t stream = 0, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { level_ptr_.push_back( std::make_unique(num_verts * sizeof(vertex_t), stream, mr)); level_size_.push_back(num_verts); + level_first_index_.push_back(first_index); } size_t current_level() const { return level_size_.size() - 1; } @@ -50,6 +52,8 @@ class Dendrogram { vertex_t get_level_size_nocheck(size_t level) const { return level_size_[level]; } + vertex_t get_level_first_index_nocheck(size_t level) const { return level_first_index_[level]; } + vertex_t const *current_level_begin() const { return get_level_ptr_nocheck(current_level()); } vertex_t const *current_level_end() const { return current_level_begin() + current_level_size(); } @@ -60,8 +64,11 @@ class Dendrogram { vertex_t current_level_size() const { return get_level_size_nocheck(current_level()); } + vertex_t current_level_first_index() const { return get_level_first_index_nocheck(current_level()); } + private: std::vector level_size_; + std::vector level_first_index_; std::vector> level_ptr_; }; diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index 994204ecd32..1563d9393d8 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(num_vertices); + this->dendrogram_->add_level(num_vertices, 0); get_permutation_vector( num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); diff --git a/cpp/src/community/flatten_dendrogram.cuh b/cpp/src/community/flatten_dendrogram.cuh index 892fe2d1c51..6961d7c9b87 100644 --- a/cpp/src/community/flatten_dendrogram.cuh +++ b/cpp/src/community/flatten_dendrogram.cuh @@ -31,23 +31,28 @@ void partition_at_level(raft::handle_t const &handle, size_t level) { vertex_t local_num_verts = dendrogram.get_level_size_nocheck(0); + rmm::device_uvector local_vertex_ids_v(local_num_verts, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - d_vertex_ids, - d_vertex_ids + local_num_verts, - d_partition); - - std::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(level), - [&handle, &dendrogram, d_vertex_ids, &d_partition, local_num_verts](size_t l) { - cugraph::experimental::relabel( - handle, - std::tuple( - d_vertex_ids, dendrogram.get_level_ptr_nocheck(l)), - dendrogram.get_level_size_nocheck(l), - d_partition, - local_num_verts); - }); + raft::copy(d_partition, d_vertex_ids, local_num_verts, handle.get_stream()); + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(level), + [&handle, &dendrogram, &local_vertex_ids_v, d_vertex_ids, &d_partition, 
local_num_verts]( + size_t l) { + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + local_vertex_ids_v.begin(), + local_vertex_ids_v.begin() + dendrogram.get_level_size_nocheck(l), + dendrogram.get_level_first_index_nocheck(l)); + + cugraph::experimental::relabel( + handle, + std::tuple(local_vertex_ids_v.data(), + dendrogram.get_level_ptr_nocheck(l)), + dendrogram.get_level_size_nocheck(l), + d_partition, + local_num_verts); + }); } } // namespace cugraph diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index 141f8beac40..3c3c7815aa8 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -132,7 +132,7 @@ class Leiden : public Louvain { // // Initialize every cluster to reference each vertex to itself // - this->dendrogram_->add_level(current_graph.number_of_vertices); + this->dendrogram_->add_level(current_graph.number_of_vertices, 0); thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), this->dendrogram_->current_level_begin(), diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index a0f7f1c2421..ae72cd8d44b 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -210,7 +210,7 @@ class Louvain { virtual void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices); + dendrogram_->add_level(num_vertices, 0); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index b0df1f2a757..e3ae267e710 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -26,6 +26,9 @@ #include #include #include +#include + +#include //#define TIMING @@ -57,19 +60,16 @@ class Louvain { handle_(handle), dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - local_num_vertices_(graph_view.get_number_of_local_vertices()), - vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + cluster_keys_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), src_vertex_weights_cache_v_(0, handle.get_stream()), - src_cluster_weights_cache_v_(0, handle.get_stream()), - dst_cluster_weights_cache_v_(0, handle.get_stream()), src_cluster_cache_v_(0, handle.get_stream()), dst_cluster_cache_v_(0, handle.get_stream()), stream_(handle.get_stream()) { if (graph_view_t::is_multi_gpu) { rank_ = handle.get_comms().get_rank(); - base_vertex_id_ = graph_view.get_local_vertex_first(); } } @@ -88,7 +88,7 @@ class Louvain { current_graph_view_, thrust::make_constant_iterator(0), thrust::make_constant_iterator(0), - [] __device__(auto, auto, weight_t wt, auto, auto) { return wt; }, + [] __device__(auto src, auto dst, weight_t wt, auto, auto) { return wt; }, weight_t{0}); while (dendrogram_->num_levels() < max_level) { @@ -141,23 +141,28 @@ class Louvain { protected: void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices); + dendrogram_->add_level(num_vertices, current_graph_view_.get_local_vertex_first()); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), dendrogram_->current_level_end(), - base_vertex_id_); + current_graph_view_.get_local_vertex_first()); } public: weight_t modularity(weight_t total_edge_weight, 
weight_t resolution) { - weight_t sum_degree_squared = experimental::transform_reduce_v( - handle_, - current_graph_view_, + weight_t sum_degree_squared = thrust::transform_reduce( + rmm::exec_policy(stream_)->on(stream_), cluster_weights_v_.begin(), + cluster_weights_v_.end(), [] __device__(weight_t p) { return p * p; }, - weight_t{0}); + weight_t{0}, + thrust::plus()); + + if (graph_t::is_multi_gpu) { + sum_degree_squared = host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, stream_); + } weight_t sum_internal = experimental::transform_reduce_e( handle_, @@ -185,16 +190,38 @@ class Louvain { vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); - thrust::copy(rmm::exec_policy(stream_)->on(stream_), - vertex_weights_v_.begin(), - vertex_weights_v_.end(), - cluster_weights_v_.begin()); + thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + current_graph_view_.get_local_vertex_first()); + + raft::copy( + cluster_weights_v_.begin(), vertex_weights_v_.begin(), vertex_weights_v_.size(), stream_); d_src_vertex_weights_cache_ = cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); - std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( - cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); + if (graph_view_t::is_multi_gpu) { + auto const comm_size = handle_.get_comms().get_size(); + rmm::device_uvector rx_keys_v(0, handle_.get_stream()); + rmm::device_uvector rx_weights_v(0, handle_.get_stream()); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin())); + + std::forward_as_tuple(std::tie(rx_keys_v, rx_weights_v), std::ignore) = + groupby_gpuid_and_shuffle_values( + handle_.get_comms(), + pair_first, + pair_first + current_graph_view_.get_number_of_local_vertices(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(thrust::get<0>(val)); }, + handle_.get_stream()); + + cluster_keys_v_ = std::move(rx_keys_v); + cluster_weights_v_ = std::move(rx_weights_v); + } timer_stop(stream_); } @@ -268,10 +295,10 @@ class Louvain { new_Q = modularity(total_edge_weight, resolution); if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(stream_)->on(stream_), - next_cluster_v.begin(), - next_cluster_v.end(), - dendrogram_->current_level_begin()); + raft::copy(dendrogram_->current_level_begin(), + next_cluster_v.begin(), + next_cluster_v.size(), + stream_); } } @@ -279,46 +306,118 @@ class Louvain { return cur_Q; } - void update_by_delta_modularity(weight_t total_edge_weight, - weight_t resolution, - rmm::device_uvector &next_cluster_v, - bool up_down) + void compute_cluster_sum_and_subtract(rmm::device_uvector &old_cluster_sum_v, + rmm::device_uvector &cluster_subtract_v) { - rmm::device_uvector old_cluster_sum_v(local_num_vertices_, stream_); - rmm::device_uvector cluster_subtract_v(local_num_vertices_, stream_); - - rmm::device_uvector tmp_cluster_keys_v(0, stream_); - rmm::device_uvector tmp_cluster_weights_v(0, stream_); + auto output_buffer = + cugraph::experimental::allocate_dataframe_buffer>( + current_graph_view_.get_number_of_local_vertices(), stream_); - // TODO: These two calls could be combined and return a pair, I would think that would be faster experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, d_src_cluster_cache_, d_dst_cluster_cache_, [] 
__device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { - if ((src != dst) && (src_cluster == nbr_cluster)) { - return wt; - } else - return weight_t{0}; - }, - weight_t{0}, - old_cluster_sum_v.begin()); + weight_t subtract{0}; + weight_t sum{0}; - experimental::copy_v_transform_reduce_out_nbr( - handle_, - current_graph_view_, - d_src_cluster_cache_, - d_dst_cluster_cache_, - [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { - return (src == dst) ? wt : weight_t{0}; + if (src == dst) + subtract = wt; + else if (src_cluster == nbr_cluster) + sum = wt; + + return thrust::make_tuple(subtract, sum); }, - weight_t{0}, - cluster_subtract_v.begin()); + thrust::make_tuple(weight_t{0}, weight_t{0}), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer)); + + thrust::transform( + rmm::exec_policy(stream_)->on(stream_), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer) + + current_graph_view_.get_number_of_local_vertices(), + old_cluster_sum_v.begin(), + [] __device__(auto p) { return thrust::get<1>(p); }); + + thrust::transform( + rmm::exec_policy(stream_)->on(stream_), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer) + + current_graph_view_.get_number_of_local_vertices(), + cluster_subtract_v.begin(), + [] __device__(auto p) { return thrust::get<0>(p); }); + } + + void update_by_delta_modularity(weight_t total_edge_weight, + weight_t resolution, + rmm::device_uvector &next_cluster_v, + bool up_down) + { + rmm::device_uvector old_cluster_sum_v(current_graph_view_.get_number_of_local_vertices(), stream_); + rmm::device_uvector cluster_subtract_v(current_graph_view_.get_number_of_local_vertices(), stream_); + rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), stream_); + rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), stream_); + + compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); auto output_buffer = cugraph::experimental::allocate_dataframe_buffer>( - local_num_vertices_, stream_); + current_graph_view_.get_number_of_local_vertices(), stream_); + + vertex_t *map_key_first; + vertex_t *map_key_last; + weight_t *map_value_first; + + if (graph_t::is_multi_gpu) { + cugraph::experimental::detail::compute_gpu_id_from_vertex_t vertex_to_gpu_id_op{ + handle_.get_comms().get_size()}; + + src_cluster_weights_v = cugraph::experimental::collect_values_for_keys( + handle_.get_comms(), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.data(), + d_src_cluster_cache_, + d_src_cluster_cache_ + src_cluster_cache_v_.size(), + vertex_to_gpu_id_op, + stream_); + + dst_cluster_weights_v = cugraph::experimental::collect_values_for_keys( + handle_.get_comms(), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.data(), + d_dst_cluster_cache_, + d_dst_cluster_cache_ + dst_cluster_cache_v_.size(), + vertex_to_gpu_id_op, + stream_); + + map_key_first = d_dst_cluster_cache_; + map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); + map_value_first = dst_cluster_weights_v.begin(); + } else { + thrust::transform(rmm::exec_policy(stream_)->on(stream_), + next_cluster_v.begin(), + next_cluster_v.end(), + src_cluster_weights_v.begin(), + [d_cluster_weights = cluster_weights_v_.data(), + d_cluster_keys = cluster_keys_v_.data(), + num_clusters = cluster_keys_v_.size()] 
__device__(vertex_t cluster) { + auto pos = thrust::find( + thrust::seq, d_cluster_keys, d_cluster_keys + num_clusters, cluster); + return d_cluster_weights[pos - d_cluster_keys]; + }); + + map_key_first = d_src_cluster_cache_; + map_key_last = d_src_cluster_cache_ + src_cluster_weights_v.size(); + map_value_first = src_cluster_weights_v.begin(); + } copy_v_transform_reduce_key_aggregated_out_nbr( handle_, @@ -326,25 +425,20 @@ class Louvain { thrust::make_zip_iterator(thrust::make_tuple(old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, cluster_subtract_v.begin(), - d_src_cluster_cache_)), + d_src_cluster_cache_, + src_cluster_weights_v.begin())), d_dst_cluster_cache_, - thrust::make_counting_iterator(base_vertex_id_), - thrust::make_counting_iterator(base_vertex_id_ + local_num_vertices_), - d_dst_cluster_weights_cache_, - [base_vertex_id = base_vertex_id_, - d_src_cluster_weights = d_src_cluster_weights_cache_, - total_edge_weight, - resolution] __device__(auto src, - auto neighbor_cluster, - auto new_cluster_sum, - auto src_info, - auto a_new) { + map_key_first, + map_key_last, + map_value_first, + [rank = rank_, total_edge_weight, resolution] __device__( + auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { auto old_cluster_sum = thrust::get<0>(src_info); auto k_k = thrust::get<1>(src_info); auto cluster_subtract = thrust::get<2>(src_info); auto src_cluster = thrust::get<3>(src_info); - auto a_old = d_src_cluster_weights[src_cluster]; + auto a_old = thrust::get<4>(src_info); if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; @@ -352,10 +446,31 @@ class Louvain { resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / (total_edge_weight * total_edge_weight)); +#if 0 + printf( + "rank = %d, src = %d, neighbor_cluster = %d, new_cluster_sum = %g, old_cluster_sum = " + "%g, " + "a_new = %g, a_old = %g, k_k = %g, delta_modularity = %g\n", + (int)rank, + (int)src, + (int)neighbor_cluster, + (float)new_cluster_sum, + (float)old_cluster_sum, + (float)a_new, + (float)a_old, + (float)k_k, + (float)delta_modularity); +#endif + return thrust::make_tuple(neighbor_cluster, delta_modularity); }, [] __device__(auto p1, auto p2) { - return (thrust::get<1>(p1) < thrust::get<1>(p2)) ? p2 : p1; + auto id1 = thrust::get<0>(p1); + auto id2 = thrust::get<0>(p2); + auto wt1 = thrust::get<1>(p1); + auto wt2 = thrust::get<1>(p2); + + return (wt1 < wt2) ? p2 : ((wt1 > wt2) ? p1 : ((id1 < id2) ? p1 : p2)); }, thrust::make_tuple(vertex_t{-1}, weight_t{0}), cugraph::experimental::get_dataframe_buffer_begin>( @@ -368,10 +483,20 @@ class Louvain { cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), next_cluster_v.begin(), - [up_down] __device__(vertex_t old_cluster, auto p) { + [rank = rank_, up_down] __device__(vertex_t old_cluster, auto p) { vertex_t new_cluster = thrust::get<0>(p); weight_t delta_modularity = thrust::get<1>(p); +#if 0 + printf( + "rank = %d, old_cluster = %d, new_cluster = %d, delta_modularity = %g, up_down = %s\n", + (int)rank, + (int)old_cluster, + (int)new_cluster, + (float)delta_modularity, + up_down ? "true" : false); +#endif + return (delta_modularity > weight_t{0}) ? (((new_cluster > old_cluster) != up_down) ? 
old_cluster : new_cluster) : old_cluster; @@ -380,31 +505,15 @@ class Louvain { std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); - std::tie(tmp_cluster_keys_v, tmp_cluster_weights_v) = + std::tie(cluster_keys_v_, cluster_weights_v_) = cugraph::experimental::transform_reduce_by_adj_matrix_row_key_e( handle_, current_graph_view_, thrust::make_constant_iterator(0), thrust::make_constant_iterator(0), d_src_cluster_cache_, - [] __device__(auto src, auto, auto wt, auto, auto) { return wt; }, + [] __device__(auto src, auto dst, auto wt, auto x, auto y) { return wt; }, weight_t{0}); - - thrust::fill(rmm::exec_policy(stream_)->on(stream_), - cluster_weights_v_.begin(), - cluster_weights_v_.end(), - weight_t{0}); - - thrust::scatter(rmm::exec_policy(stream_)->on(stream_), - tmp_cluster_weights_v.begin(), - tmp_cluster_weights_v.end(), - thrust::make_transform_iterator(tmp_cluster_keys_v.begin(), - [base_vertex_id = base_vertex_id_] __device__( - auto key) { return key - base_vertex_id; }), - cluster_weights_v_.begin()); - - std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( - cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); } void shrink_graph() @@ -418,20 +527,17 @@ class Louvain { current_graph_view_ = current_graph_->view(); - local_num_vertices_ = current_graph_view_.get_number_of_local_vertices(); - base_vertex_id_ = current_graph_view_.get_local_vertex_first(); - rmm::device_uvector numbering_indices(numbering_map.size(), stream_); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), numbering_indices.begin(), numbering_indices.end(), - base_vertex_id_); + current_graph_view_.get_local_vertex_first()); relabel( handle_, std::make_tuple(static_cast(numbering_map.begin()), static_cast(numbering_indices.begin())), - local_num_vertices_, + current_graph_view_.get_number_of_local_vertices(), dendrogram_->current_level_begin(), dendrogram_->current_level_size()); @@ -444,8 +550,6 @@ class Louvain { std::unique_ptr> dendrogram_; - vertex_t local_num_vertices_; - vertex_t base_vertex_id_{0}; int rank_{0}; // @@ -457,16 +561,13 @@ class Louvain { graph_view_t current_graph_view_; rmm::device_uvector vertex_weights_v_; - rmm::device_uvector cluster_weights_v_; rmm::device_uvector src_vertex_weights_cache_v_; - rmm::device_uvector src_cluster_weights_cache_v_; - rmm::device_uvector dst_cluster_weights_cache_v_; rmm::device_uvector src_cluster_cache_v_; rmm::device_uvector dst_cluster_cache_v_; + rmm::device_uvector cluster_keys_v_; + rmm::device_uvector cluster_weights_v_; weight_t *d_src_vertex_weights_cache_; - weight_t *d_src_cluster_weights_cache_; - weight_t *d_dst_cluster_weights_cache_; vertex_t *d_src_cluster_cache_; vertex_t *d_dst_cluster_cache_; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 813008a6fdf..d686125db29 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -427,9 +427,6 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG LOUVAIN tests --------------------------------------------------------------------- set(MG_LOUVAIN_TEST_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/mg_test_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_mg_test.cpp") ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") diff --git a/cpp/tests/community/louvain_mg_test.cpp 
b/cpp/tests/community/louvain_mg_test.cpp index 06715d52bc0..3c14f56d3c8 100644 --- a/cpp/tests/community/louvain_mg_test.cpp +++ b/cpp/tests/community/louvain_mg_test.cpp @@ -15,14 +15,23 @@ */ #include -#include #include #include #include +#include +#include +#include + #include +void compare(float modularity, float sg_modularity) { ASSERT_FLOAT_EQ(modularity, sg_modularity); } +void compare(double modularity, double sg_modularity) +{ + ASSERT_DOUBLE_EQ(modularity, sg_modularity); +} + //////////////////////////////////////////////////////////////////////////////// // Test param object. This defines the input and expected output for a test, and // will be instantiated as the parameter to the tests defined below using @@ -34,8 +43,8 @@ struct Louvain_Testparams { size_t max_level; double resolution; - // TODO: We really should have a Graph_Testparms_Base class or something - // like that which can handle this graph_full_path thing. + // FIXME: We really should have a Graph_Testparms_Base class or something + // like that which can handle this graph_full_path thing. // Louvain_Testparams(std::string const& graph_file_path, bool weighted, @@ -57,9 +66,11 @@ struct Louvain_Testparams { // test. In this case, each test is identical except for the inputs and // expected outputs, so the entire test is defined in the run_test() method. // -class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, - public ::testing::WithParamInterface { +class Louvain_MG_Testfixture : public ::testing::TestWithParam { public: + static void SetUpTestCase() {} + static void TearDownTestCase() {} + // Run once for each test instance virtual void SetUp() {} virtual void TearDown() {} @@ -73,13 +84,16 @@ class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, size_t max_level, weight_t resolution) { - // TODO: Put this in the Graph test base class - // (make the call simpler here) - auto graph = - cugraph::test::read_graph_from_matrix_market_file( - handle, graph_file_path, true); // FIXME: should use param.test_weighted instead of true - - auto graph_view = graph.view(); + // FIXME: Put this in the Graph test base class + // (make the call simpler here) + auto graph_tuple = + cugraph::test::read_graph_from_matrix_market_file( + handle, + graph_file_path, + true, + false); // FIXME: should use param.test_weighted instead of true + + auto graph_view = std::get<0>(graph_tuple).view(); cudaStream_t stream = handle.get_stream(); rmm::device_uvector clustering_v(graph_view.get_number_of_local_vertices(), stream); @@ -97,36 +111,46 @@ class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, } // Compare the results of running louvain on multiple GPUs to that of a - // single-GPU run for the configuration in param. + // single-GPU run for the configuration in param. Note that MNMG Louvain + // and single GPU Louvain are ONLY deterministic through a single + // iteration of the outer loop. Renumbering of the partitions when coarsening + // the graph is a function of the number of GPUs in the GPU cluster. template void run_test(const Louvain_Testparams& param) { raft::handle_t handle; + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); const auto& comm = handle.get_comms(); - cudaStream_t stream = handle.get_stream(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); - // Assuming 2 GPUs which means 1 row, 2 cols. 2 cols = row_comm_size of 2. 
- // FIXME: DO NOT ASSUME 2 GPUs, add code to compute prows, pcols - size_t row_comm_size{2}; + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); - int my_rank = comm.get_rank(); + cudaStream_t stream = handle.get_stream(); - // FIXME: graph must be weighted! - std::unique_ptr> // store_transposed=false, - // multi_gpu=true - mg_graph_ptr{}; + cugraph::experimental::graph_t mg_graph(handle); rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); - std::tie(mg_graph_ptr, d_renumber_map_labels) = cugraph::test:: - create_graph_for_gpu // store_transposed=true - (handle, param.graph_file_full_path); + std::tie(mg_graph, d_renumber_map_labels) = + cugraph::test::read_graph_from_matrix_market_file( + handle, param.graph_file_full_path, true, false); + + // Each GPU will have a subset of the clustering + int sg_level; + weight_t sg_modularity; + std::vector sg_clustering; + + // FIXME: Consider how to test for max_level > 1 + // perhaps some sort of approximation + // size_t local_max_level{param.max_level}; + size_t local_max_level{1}; - auto mg_graph_view = mg_graph_ptr->view(); + auto mg_graph_view = mg_graph.view(); rmm::device_uvector clustering_v(mg_graph_view.get_number_of_local_vertices(), stream); @@ -136,78 +160,30 @@ class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, int level; weight_t modularity; - std::cout << "calling MG louvain" << std::endl; - std::tie(level, modularity) = cugraph::louvain( - handle, mg_graph_view, clustering_v.data(), param.max_level, param.resolution); - - std::vector clustering(mg_graph_view.get_number_of_local_vertices()); - - raft::update_host(clustering.data(), clustering_v.data(), clustering_v.size(), stream); - - std::vector h_renumber_map_labels(mg_graph_view.get_number_of_vertices()); - raft::update_host(h_renumber_map_labels.data(), - d_renumber_map_labels.data(), - d_renumber_map_labels.size(), - stream); + handle, mg_graph_view, clustering_v.data(), local_max_level, param.resolution); - // Compare MG to SG + if (comm_rank == 0) { + SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); - // Each GPU will have a subset of the clustering - int sg_level; - weight_t sg_modularity; - std::vector sg_clustering; + std::tie(sg_level, sg_modularity, sg_clustering) = get_sg_results( + handle, param.graph_file_full_path, local_max_level, param.resolution); - std::tie(sg_level, sg_modularity, sg_clustering) = get_sg_results( - handle, param.graph_file_full_path, param.max_level, param.resolution); - - std::cout << "MG: level = " << level << ", modularity = " << modularity << std::endl; - raft::print_host_vector("clustering", clustering.data(), clustering.size(), std::cout); - - std::cout << "SG: level = " << sg_level << ", modularity = " << sg_modularity << std::endl; - raft::print_host_vector("clustering", sg_clustering.data(), sg_clustering.size(), std::cout); - -#if 0 - // For this test, each GPU will have the full set of vertices and - // therefore the pageranks vectors should be equal in size. 
- ASSERT_EQ(h_sg_pageranks.size(), h_mg_pageranks.size()); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - vertex_t mapped_vertex{0}; - for (vertex_t i = 0; - i + mg_graph_view.get_local_vertex_first() < mg_graph_view.get_local_vertex_last(); - ++i) { - mapped_vertex = h_renumber_map_labels[i]; - ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex])) - << "MG PageRank value for vertex: " << i << " in rank: " << my_rank - << " has value: " << h_mg_pageranks[i] - << " which exceeds the error margin for comparing to SG value: " << h_sg_pageranks[i]; + compare(modularity, sg_modularity); } -#endif } }; //////////////////////////////////////////////////////////////////////////////// -TEST_P(Louvain_MG_Testfixture, CheckInt32Int32FloatFloat) +TEST_P(Louvain_MG_Testfixture, CheckInt32Int32Float) { run_test(GetParam()); } INSTANTIATE_TEST_CASE_P( - e2e, + simple_test, Louvain_MG_Testfixture, - ::testing::Values(Louvain_Testparams("test/datasets/karate.mtx", true, 100, 1) - // Louvain_Testparams("test/datasets/webbase-1M.mtx", true, 100, 1), - )); + ::testing::Values(Louvain_Testparams("test/datasets/karate.mtx", true, 100, 1), + Louvain_Testparams("test/datasets/smallworld.mtx", true, 100, 1))); -// FIXME: Enable proper RMM configuration by using CUGRAPH_TEST_PROGRAM_MAIN(). -// Currently seeing a RMM failure during init, need to investigate. -// CUGRAPH_TEST_PROGRAM_MAIN() +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index 1b9fe6051f7..fa10c31a5a4 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -30,39 +30,47 @@ namespace test { namespace detail { -template -std::enable_if_t< - multi_gpu, - std::tuple< - cugraph::experimental::graph_t, - rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) -{ - CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); - - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_size = col_comm.get_size(); - - vertex_t number_of_vertices = static_cast(vertices.size()); +template +struct compute_gpu_id_from_vertex_no_renumbering { + int comm_size{0}; + vertex_t num_vertices{1}; + + __device__ int operator()(vertex_t v) const + { + vertex_t vertices_per_gpu = static_cast(num_vertices + comm_size - 1) / comm_size; + return static_cast(v / vertices_per_gpu); + } +}; + +template +struct compute_gpu_id_from_edge_no_renumbering { + bool hypergraph_partitioned{false}; + int comm_size{0}; + int row_comm_size{0}; + int col_comm_size{0}; + vertex_t 
num_vertices{1}; + + __device__ int operator()(vertex_t major, vertex_t minor) const + { + vertex_t vertices_per_gpu = static_cast(num_vertices + comm_size - 1) / comm_size; + auto major_comm_rank = static_cast(major / vertices_per_gpu); + auto minor_comm_rank = static_cast(minor / vertices_per_gpu); + + if (hypergraph_partitioned) { + return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); + } else { + return (major_comm_rank - (major_comm_rank % row_comm_size)) + + (minor_comm_rank / col_comm_size); + } + } +}; - auto vertex_key_func = - cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; +template +void filter_vertices(raft::handle_t const& handle, + rmm::device_uvector& vertices, + key_function_t vertex_key_func, + int comm_rank) +{ vertices.resize(thrust::distance(vertices.begin(), thrust::remove_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), @@ -72,10 +80,17 @@ generate_graph_from_edgelist(raft::handle_t const& handle, return key_func(val) != comm_rank; })), handle.get_stream()); - vertices.shrink_to_fit(handle.get_stream()); +} - auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - false, comm_size, row_comm_size, col_comm_size}; +template +void filter_edges(raft::handle_t const& handle, + rmm::device_uvector& edgelist_rows, + rmm::device_uvector& edgelist_cols, + rmm::device_uvector& edgelist_weights, + key_function_t edge_key_func, + int comm_rank, + bool test_weighted) +{ size_t number_of_local_edges{}; if (test_weighted) { auto edge_first = thrust::make_zip_iterator( @@ -106,35 +121,120 @@ generate_graph_from_edgelist(raft::handle_t const& handle, } edgelist_rows.resize(number_of_local_edges, handle.get_stream()); - edgelist_rows.shrink_to_fit(handle.get_stream()); edgelist_cols.resize(number_of_local_edges, handle.get_stream()); - edgelist_cols.shrink_to_fit(handle.get_stream()); - if (test_weighted) { - edgelist_weights.resize(number_of_local_edges, handle.get_stream()); - edgelist_weights.shrink_to_fit(handle.get_stream()); + if (test_weighted) edgelist_weights.resize(number_of_local_edges, handle.get_stream()); +} + +template +std::enable_if_t< + multi_gpu, + std::tuple< + cugraph::experimental::graph_t, + rmm::device_uvector>> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + vertex_t number_of_vertices = static_cast(vertices.size()); + edge_t number_of_edges = static_cast(edgelist_rows.size()); + + if (renumber) { + filter_vertices( + handle, + vertices, + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}, + comm_rank); + filter_edges( + handle, + edgelist_rows, + edgelist_cols, + edgelist_weights, + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + false, comm_size, row_comm_size, col_comm_size}, + comm_rank, + 
test_weighted); + } else { + filter_vertices( + handle, + vertices, + compute_gpu_id_from_vertex_no_renumbering{comm_size, number_of_vertices}, + comm_rank); + filter_edges(handle, + edgelist_rows, + edgelist_cols, + edgelist_weights, + compute_gpu_id_from_edge_no_renumbering{ + false, comm_size, row_comm_size, col_comm_size, number_of_vertices}, + comm_rank, + test_weighted); } + vertices.shrink_to_fit(handle.get_stream()); + edgelist_rows.shrink_to_fit(handle.get_stream()); + edgelist_cols.shrink_to_fit(handle.get_stream()); + if (test_weighted) edgelist_weights.shrink_to_fit(handle.get_stream()); + // 3. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; vertex_t aggregate_number_of_vertices{}; - edge_t number_of_edges{}; - // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = - cugraph::experimental::renumber_edgelist( - handle, - vertices.data(), - static_cast(vertices.size()), - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? edgelist_rows.data() : edgelist_cols.data(), - edgelist_rows.size(), - false, - true); - assert(aggregate_number_of_vertices == number_of_vertices); - // 4. create a graph + if (renumber) { + // FIXME: set do_expensive_check to false once validated + std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = + cugraph::experimental::renumber_edgelist( + handle, + vertices.data(), + static_cast(vertices.size()), + store_transposed ? edgelist_cols.data() : edgelist_rows.data(), + store_transposed ? edgelist_rows.data() : edgelist_cols.data(), + edgelist_rows.size(), + false, + true); + assert(aggregate_number_of_vertices == number_of_vertices); + } else { + std::vector vertex_partition_offsets(comm_size + 1, 0); + vertex_t vertices_per_gpu = + static_cast(number_of_vertices + comm_size - 1) / comm_size; + + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(comm_size), + [h_vertex_partition_offsets = vertex_partition_offsets.data(), vertices_per_gpu]( + auto idx) { h_vertex_partition_offsets[idx] = idx * vertices_per_gpu; }); + + vertex_partition_offsets[comm_size] = number_of_vertices; + partition = cugraph::experimental::partition_t(vertex_partition_offsets, + false, // is_hypergraph_partitioned, + row_comm_size, + col_comm_size, + row_comm_rank, + col_comm_rank); + + //number_of_edges = edgelist_rows.size(); + } + + // 4. 
create a graph return std::make_tuple( cugraph::experimental::graph_t( handle, @@ -148,7 +248,7 @@ generate_graph_from_edgelist(raft::handle_t const& handle, number_of_vertices, number_of_edges, cugraph::experimental::graph_properties_t{is_symmetric, false}, - true, + false, true), std::move(renumber_map_labels)); } From 7cb4c135b364bd97ef6e15a768bf4dbc093f30e5 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Tue, 16 Mar 2021 15:03:40 -0400 Subject: [PATCH 06/18] respond to PR comments --- cpp/src/community/dendrogram.cuh | 4 +- cpp/src/community/ecg.cu | 2 +- cpp/src/community/leiden.cuh | 2 +- cpp/src/community/louvain.cuh | 2 +- cpp/src/experimental/louvain.cuh | 145 +++++++++++++------------------ 5 files changed, 66 insertions(+), 89 deletions(-) diff --git a/cpp/src/community/dendrogram.cuh b/cpp/src/community/dendrogram.cuh index eed34064d9a..9389de3292d 100644 --- a/cpp/src/community/dendrogram.cuh +++ b/cpp/src/community/dendrogram.cuh @@ -25,8 +25,8 @@ namespace cugraph { template class Dendrogram { public: - void add_level(vertex_t num_verts, - vertex_t first_index, + void add_level(vertex_t first_index, + vertex_t num_verts, cudaStream_t stream = 0, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index 1563d9393d8..45f7d723191 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(num_vertices, 0); + this->dendrogram_->add_level(0, num_vertices); get_permutation_vector( num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index 3c3c7815aa8..aae2d3712b5 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -132,7 +132,7 @@ class Leiden : public Louvain { // // Initialize every cluster to reference each vertex to itself // - this->dendrogram_->add_level(current_graph.number_of_vertices, 0); + this->dendrogram_->add_level(0, current_graph.number_of_vertices); thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), this->dendrogram_->current_level_begin(), diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index ae72cd8d44b..b81e619e747 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -210,7 +210,7 @@ class Louvain { virtual void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices, 0); + dendrogram_->add_level(0, num_vertices); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index e3ae267e710..60086341e4b 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -65,12 +65,8 @@ class Louvain { vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), src_vertex_weights_cache_v_(0, handle.get_stream()), src_cluster_cache_v_(0, handle.get_stream()), - dst_cluster_cache_v_(0, handle.get_stream()), - stream_(handle.get_stream()) + dst_cluster_cache_v_(0, handle.get_stream()) { - if (graph_view_t::is_multi_gpu) { - rank_ = handle.get_comms().get_rank(); - } } Dendrogram const &get_dendrogram() const { return *dendrogram_; } @@ -117,14 +113,23 @@ class Louvain { void timer_start(std::string const ®ion) { 
#ifdef TIMING - if (rank_ == 0) hr_timer_.start(region); + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) hr_timer_.start(region); + } else { + hr_timer_.start(region); + } #endif } void timer_stop(cudaStream_t stream) { #ifdef TIMING - if (rank_ == 0) { + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) { + CUDA_TRY(cudaStreamSynchronize(stream)); + hr_timer_.stop(); + } + } else { CUDA_TRY(cudaStreamSynchronize(stream)); hr_timer_.stop(); } @@ -134,16 +139,20 @@ class Louvain { void timer_display(std::ostream &os) { #ifdef TIMING - if (rank_ == 0) hr_timer_.display(os); + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) hr_timer_.display(os); + } else { + hr_timer_.display(os); + } #endif } protected: void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices, current_graph_view_.get_local_vertex_first()); + dendrogram_->add_level(current_graph_view_.get_local_vertex_first(), num_vertices); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), dendrogram_->current_level_begin(), dendrogram_->current_level_end(), current_graph_view_.get_local_vertex_first()); @@ -153,7 +162,7 @@ class Louvain { weight_t modularity(weight_t total_edge_weight, weight_t resolution) { weight_t sum_degree_squared = thrust::transform_reduce( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cluster_weights_v_.begin(), cluster_weights_v_.end(), [] __device__(weight_t p) { return p * p; }, @@ -161,7 +170,8 @@ class Louvain { thrust::plus()); if (graph_t::is_multi_gpu) { - sum_degree_squared = host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, stream_); + sum_degree_squared = + host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, handle_.get_stream()); } weight_t sum_internal = experimental::transform_reduce_e( @@ -190,13 +200,15 @@ class Louvain { vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cluster_keys_v_.begin(), cluster_keys_v_.end(), current_graph_view_.get_local_vertex_first()); - raft::copy( - cluster_weights_v_.begin(), vertex_weights_v_.begin(), vertex_weights_v_.size(), stream_); + raft::copy(cluster_weights_v_.begin(), + vertex_weights_v_.begin(), + vertex_weights_v_.size(), + handle_.get_stream()); d_src_vertex_weights_cache_ = cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); @@ -223,7 +235,7 @@ class Louvain { cluster_weights_v_ = std::move(rx_weights_v); } - timer_stop(stream_); + timer_stop(handle_.get_stream()); } template @@ -231,7 +243,7 @@ class Louvain { { if (graph_view_t::is_multi_gpu) { src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), - stream_); + handle_.get_stream()); copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); return src_cache_v.begin(); } else { @@ -244,7 +256,7 @@ class Louvain { { if (graph_view_t::is_multi_gpu) { dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), - stream_); + handle_.get_stream()); copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); return dst_cache_v.begin(); } else { @@ -252,30 +264,20 @@ class Louvain { } } - template - 
std::tuple cache_vertex_properties(rmm::device_uvector &input, - rmm::device_uvector &src_cache_v, - rmm::device_uvector &dst_cache_v) - { - auto src = cache_src_vertex_properties(input, src_cache_v); - auto dst = cache_dst_vertex_properties(input, dst_cache_v); - - return std::make_tuple(src, dst); - } - virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); - rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), stream_); + rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), + handle_.get_stream()); raft::copy(next_cluster_v.begin(), dendrogram_->current_level_begin(), dendrogram_->current_level_size(), - stream_); + handle_.get_stream()); - std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = - cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); + d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); + d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -298,11 +300,11 @@ class Louvain { raft::copy(dendrogram_->current_level_begin(), next_cluster_v.begin(), next_cluster_v.size(), - stream_); + handle_.get_stream()); } } - timer_stop(stream_); + timer_stop(handle_.get_stream()); return cur_Q; } @@ -311,7 +313,7 @@ class Louvain { { auto output_buffer = cugraph::experimental::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), stream_); + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); experimental::copy_v_transform_reduce_out_nbr( handle_, @@ -334,7 +336,7 @@ class Louvain { output_buffer)); thrust::transform( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), cugraph::experimental::get_dataframe_buffer_begin>( @@ -344,7 +346,7 @@ class Louvain { [] __device__(auto p) { return thrust::get<1>(p); }); thrust::transform( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), cugraph::experimental::get_dataframe_buffer_begin>( @@ -359,16 +361,20 @@ class Louvain { rmm::device_uvector &next_cluster_v, bool up_down) { - rmm::device_uvector old_cluster_sum_v(current_graph_view_.get_number_of_local_vertices(), stream_); - rmm::device_uvector cluster_subtract_v(current_graph_view_.get_number_of_local_vertices(), stream_); - rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), stream_); - rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), stream_); + rmm::device_uvector old_cluster_sum_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector cluster_subtract_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), + handle_.get_stream()); + rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), + handle_.get_stream()); compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); auto output_buffer = cugraph::experimental::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), stream_); + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); vertex_t *map_key_first; vertex_t 
*map_key_last; @@ -386,7 +392,7 @@ class Louvain { d_src_cluster_cache_, d_src_cluster_cache_ + src_cluster_cache_v_.size(), vertex_to_gpu_id_op, - stream_); + handle_.get_stream()); dst_cluster_weights_v = cugraph::experimental::collect_values_for_keys( handle_.get_comms(), @@ -396,13 +402,13 @@ class Louvain { d_dst_cluster_cache_, d_dst_cluster_cache_ + dst_cluster_cache_v_.size(), vertex_to_gpu_id_op, - stream_); + handle_.get_stream()); map_key_first = d_dst_cluster_cache_; map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); map_value_first = dst_cluster_weights_v.begin(); } else { - thrust::transform(rmm::exec_policy(stream_)->on(stream_), + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), next_cluster_v.begin(), next_cluster_v.end(), src_cluster_weights_v.begin(), @@ -432,7 +438,7 @@ class Louvain { map_key_first, map_key_last, map_value_first, - [rank = rank_, total_edge_weight, resolution] __device__( + [total_edge_weight, resolution] __device__( auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { auto old_cluster_sum = thrust::get<0>(src_info); auto k_k = thrust::get<1>(src_info); @@ -446,22 +452,6 @@ class Louvain { resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / (total_edge_weight * total_edge_weight)); -#if 0 - printf( - "rank = %d, src = %d, neighbor_cluster = %d, new_cluster_sum = %g, old_cluster_sum = " - "%g, " - "a_new = %g, a_old = %g, k_k = %g, delta_modularity = %g\n", - (int)rank, - (int)src, - (int)neighbor_cluster, - (float)new_cluster_sum, - (float)old_cluster_sum, - (float)a_new, - (float)a_old, - (float)k_k, - (float)delta_modularity); -#endif - return thrust::make_tuple(neighbor_cluster, delta_modularity); }, [] __device__(auto p1, auto p2) { @@ -477,33 +467,23 @@ class Louvain { output_buffer)); thrust::transform( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), next_cluster_v.begin(), next_cluster_v.end(), cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), next_cluster_v.begin(), - [rank = rank_, up_down] __device__(vertex_t old_cluster, auto p) { + [up_down] __device__(vertex_t old_cluster, auto p) { vertex_t new_cluster = thrust::get<0>(p); weight_t delta_modularity = thrust::get<1>(p); -#if 0 - printf( - "rank = %d, old_cluster = %d, new_cluster = %d, delta_modularity = %g, up_down = %s\n", - (int)rank, - (int)old_cluster, - (int)new_cluster, - (float)delta_modularity, - up_down ? "true" : false); -#endif - return (delta_modularity > weight_t{0}) ? (((new_cluster > old_cluster) != up_down) ? 
old_cluster : new_cluster) : old_cluster; }); - std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = - cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); + d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); + d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); std::tie(cluster_keys_v_, cluster_weights_v_) = cugraph::experimental::transform_reduce_by_adj_matrix_row_key_e( @@ -520,15 +500,15 @@ class Louvain { { timer_start("shrinking graph"); - rmm::device_uvector numbering_map(0, stream_); + rmm::device_uvector numbering_map(0, handle_.get_stream()); std::tie(current_graph_, numbering_map) = coarsen_graph(handle_, current_graph_view_, dendrogram_->current_level_begin()); current_graph_view_ = current_graph_->view(); - rmm::device_uvector numbering_indices(numbering_map.size(), stream_); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + rmm::device_uvector numbering_indices(numbering_map.size(), handle_.get_stream()); + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), numbering_indices.begin(), numbering_indices.end(), current_graph_view_.get_local_vertex_first()); @@ -541,17 +521,14 @@ class Louvain { dendrogram_->current_level_begin(), dendrogram_->current_level_size()); - timer_stop(stream_); + timer_stop(handle_.get_stream()); } protected: raft::handle_t const &handle_; - cudaStream_t stream_; std::unique_ptr> dendrogram_; - int rank_{0}; - // // Initially we run on the input graph view, // but as we shrink the graph we'll keep the From d660d4ce5b48045bda785b410d366a44538d3738 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Tue, 16 Mar 2021 15:34:18 -0400 Subject: [PATCH 07/18] rename louvain mg test file --- cpp/tests/CMakeLists.txt | 2 +- .../community/{louvain_mg_test.cpp => mg_louvain_test.cpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/tests/community/{louvain_mg_test.cpp => mg_louvain_test.cpp} (100%) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d686125db29..80c7b8193fb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -427,7 +427,7 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG LOUVAIN tests --------------------------------------------------------------------- set(MG_LOUVAIN_TEST_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_mg_test.cpp") + "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_test.cpp") ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) diff --git a/cpp/tests/community/louvain_mg_test.cpp b/cpp/tests/community/mg_louvain_test.cpp similarity index 100% rename from cpp/tests/community/louvain_mg_test.cpp rename to cpp/tests/community/mg_louvain_test.cpp From c90f608b305e2d14f55a648a22ba96473604fe2a Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Tue, 16 Mar 2021 15:45:32 -0400 Subject: [PATCH 08/18] change Testparams to Usecase --- cpp/tests/community/mg_louvain_test.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index 3c14f56d3c8..e97f93d4424 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -37,7 +37,7 @@ void compare(double modularity, double sg_modularity) // will be instantiated as the parameter to the tests defined below using // INSTANTIATE_TEST_CASE_P() // -struct Louvain_Testparams { 
+struct Louvain_Usecase { std::string graph_file_full_path{}; bool weighted{false}; size_t max_level; @@ -46,10 +46,10 @@ struct Louvain_Testparams { // FIXME: We really should have a Graph_Testparms_Base class or something // like that which can handle this graph_full_path thing. // - Louvain_Testparams(std::string const& graph_file_path, - bool weighted, - size_t max_level, - double resolution) + Louvain_Usecase(std::string const& graph_file_path, + bool weighted, + size_t max_level, + double resolution) : weighted(weighted), max_level(max_level), resolution(resolution) { if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -66,7 +66,7 @@ struct Louvain_Testparams { // test. In this case, each test is identical except for the inputs and // expected outputs, so the entire test is defined in the run_test() method. // -class Louvain_MG_Testfixture : public ::testing::TestWithParam { +class Louvain_MG_Testfixture : public ::testing::TestWithParam { public: static void SetUpTestCase() {} static void TearDownTestCase() {} @@ -116,7 +116,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam - void run_test(const Louvain_Testparams& param) + void run_test(const Louvain_Usecase& param) { raft::handle_t handle; @@ -183,7 +183,7 @@ TEST_P(Louvain_MG_Testfixture, CheckInt32Int32Float) INSTANTIATE_TEST_CASE_P( simple_test, Louvain_MG_Testfixture, - ::testing::Values(Louvain_Testparams("test/datasets/karate.mtx", true, 100, 1), - Louvain_Testparams("test/datasets/smallworld.mtx", true, 100, 1))); + ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 100, 1), + Louvain_Usecase("test/datasets/smallworld.mtx", true, 100, 1))); CUGRAPH_MG_TEST_PROGRAM_MAIN() From 0673a527400d6bd1643e15c75d981415e4e4d28d Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Thu, 25 Mar 2021 09:48:33 -0400 Subject: [PATCH 09/18] Address PR review items, improved testing 1. Exposed flatten_dendrogram and a version of louvain that returns dendrogram in algorithms 2. Added methods to the legacy graph implementations to make it easier to template between old and new graph objects 3. Removed some class member data that can be derived 4. Added code to suppress gtest output from rank > 0 --- cpp/include/algorithms.hpp | 64 ++++ .../dendrogram.cuh => include/dendrogram.hpp} | 26 +- cpp/include/graph.hpp | 8 + cpp/src/community/flatten_dendrogram.cuh | 2 +- cpp/src/community/louvain.cu | 122 +++--- cpp/src/community/louvain.cuh | 4 +- cpp/src/experimental/louvain.cuh | 4 +- cpp/tests/CMakeLists.txt | 2 + cpp/tests/community/mg_louvain_helper.cu | 353 ++++++++++++++++++ cpp/tests/community/mg_louvain_helper.hpp | 51 +++ cpp/tests/community/mg_louvain_test.cpp | 150 +++++--- cpp/tests/utilities/base_fixture.hpp | 5 + 12 files changed, 677 insertions(+), 114 deletions(-) rename cpp/{src/community/dendrogram.cuh => include/dendrogram.hpp} (68%) create mode 100644 cpp/tests/community/mg_louvain_helper.cu create mode 100644 cpp/tests/community/mg_louvain_helper.hpp diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp index c3a4f3ec985..1c4a84a27e4 100644 --- a/cpp/include/algorithms.hpp +++ b/cpp/include/algorithms.hpp @@ -14,10 +14,14 @@ * limitations under the License. 
*/ #pragma once + +#include #include #include + #include #include + #include namespace cugraph { @@ -637,6 +641,66 @@ std::pair louvain( size_t max_level = 100, typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); +/** + * @brief Louvain implementation, returning dendrogram + * + * Compute a clustering of the graph by maximizing modularity + * + * Computed using the Louvain method described in: + * + * VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of + * community hierarchies in large networks, J Stat Mech P10008 (2008), + * http://arxiv.org/abs/0803.0476 + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam graph_t Type of graph + * + * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, + * @param[in] graph input graph object (CSR) + * @param[in] max_level (optional) maximum number of levels to run (default 100) + * @param[in] resolution (optional) The value of the resolution parameter to use. + * Called gamma in the modularity formula, this changes the size + * of the communities. Higher resolutions lead to more smaller + * communities, lower resolutions lead to fewer larger + * communities. (default 1) + * + * @return a pair containing: + * 1) unique pointer to dendrogram + * 2) modularity of the returned clustering + * + */ +template +std::pair>, typename graph_t::weight_type> +louvain(raft::handle_t const &handle, + graph_t const &graph, + size_t max_level = 100, + typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); + +/** + * @brief Flatten a Dendrogram at a particular level + * + * A Dendrogram represents a hierarchical clustering/partitioning of + * a graph. This function will flatten the hierarchical clustering into + * a label for each vertex representing the final cluster/partition to + * which it is assigned + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam graph_t Type of graph + * + * @param[in] handle Library handle (RAFT). 
If a communicator is set in the handle, + * @param[in] graph input graph object + * @param[in] dendrogram input dendrogram object + * @param[out] clustering Pointer to device array where the clustering should be stored + * + */ +template +void flatten_dendrogram(raft::handle_t const &handle, + graph_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_t::vertex_type *clustering); + /** * @brief Leiden implementation * diff --git a/cpp/src/community/dendrogram.cuh b/cpp/include/dendrogram.hpp similarity index 68% rename from cpp/src/community/dendrogram.cuh rename to cpp/include/dendrogram.hpp index 9389de3292d..83a35110605 100644 --- a/cpp/src/community/dendrogram.cuh +++ b/cpp/include/dendrogram.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -30,27 +31,26 @@ class Dendrogram { cudaStream_t stream = 0, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { - level_ptr_.push_back( - std::make_unique(num_verts * sizeof(vertex_t), stream, mr)); - level_size_.push_back(num_verts); + level_ptr_.push_back(std::make_unique>(num_verts, stream, mr)); level_first_index_.push_back(first_index); } - size_t current_level() const { return level_size_.size() - 1; } + size_t current_level() const { return level_ptr_.size() - 1; } - size_t num_levels() const { return level_size_.size(); } + size_t num_levels() const { return level_ptr_.size(); } vertex_t const *get_level_ptr_nocheck(size_t level) const { - return static_cast(level_ptr_[level]->data()); + //return static_cast(level_ptr_[level]->data()); + return level_ptr_[level]->data(); } vertex_t *get_level_ptr_nocheck(size_t level) { - return static_cast(level_ptr_[level]->data()); + return level_ptr_[level]->data(); } - vertex_t get_level_size_nocheck(size_t level) const { return level_size_[level]; } + size_t get_level_size_nocheck(size_t level) const { return level_ptr_[level]->size(); } vertex_t get_level_first_index_nocheck(size_t level) const { return level_first_index_[level]; } @@ -62,14 +62,16 @@ class Dendrogram { vertex_t *current_level_end() { return current_level_begin() + current_level_size(); } - vertex_t current_level_size() const { return get_level_size_nocheck(current_level()); } + size_t current_level_size() const { return get_level_size_nocheck(current_level()); } - vertex_t current_level_first_index() const { return get_level_first_index_nocheck(current_level()); } + vertex_t current_level_first_index() const + { + return get_level_first_index_nocheck(current_level()); + } private: - std::vector level_size_; std::vector level_first_index_; - std::vector> level_ptr_; + std::vector>> level_ptr_; }; } // namespace cugraph diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index b30159566b5..c4ca3b40c11 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -69,6 +69,14 @@ class GraphViewBase { edge_t *local_edges; vertex_t *local_offsets; + vertex_t get_number_of_vertices() const { + return number_of_vertices; + } + + vertex_t get_local_vertex_first() const { + return vertex_t{0}; + } + /** * @brief Fill the identifiers array with the vertex identifiers. 
* diff --git a/cpp/src/community/flatten_dendrogram.cuh b/cpp/src/community/flatten_dendrogram.cuh index 6961d7c9b87..6d455a68192 100644 --- a/cpp/src/community/flatten_dendrogram.cuh +++ b/cpp/src/community/flatten_dendrogram.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index a851777ad93..ab27923cdb1 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -26,50 +26,28 @@ namespace cugraph { namespace detail { template -std::pair louvain(raft::handle_t const &handle, - GraphCSRView const &graph_view, - vertex_t *clustering, - size_t max_level, - weight_t resolution) +std::pair>, weight_t> louvain( + raft::handle_t const &handle, + GraphCSRView const &graph_view, + size_t max_level, + weight_t resolution) { CUGRAPH_EXPECTS(graph_view.edge_data != nullptr, "Invalid input argument: louvain expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, - "Invalid input argument: clustering is null, should be a device pointer to " - "memory for storing the result"); Louvain> runner(handle, graph_view); weight_t wt = runner(max_level, resolution); - rmm::device_uvector vertex_ids_v(graph_view.number_of_vertices, handle.get_stream()); - - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_ids_v.begin(), - vertex_ids_v.end(), - vertex_t{0}); - - partition_at_level(handle, - runner.get_dendrogram(), - vertex_ids_v.data(), - clustering, - runner.get_dendrogram().num_levels()); - - // FIXME: Consider returning the Dendrogram at some point - return std::make_pair(runner.get_dendrogram().num_levels(), wt); + return std::make_pair(runner.move_dendrogram(), wt); } template -std::pair louvain( +std::pair>, weight_t> louvain( raft::handle_t const &handle, experimental::graph_view_t const &graph_view, - vertex_t *clustering, size_t max_level, weight_t resolution) { - CUGRAPH_EXPECTS(clustering != nullptr, - "Invalid input argument: clustering is null, should be a device pointer to " - "memory for storing the result"); - // "FIXME": remove this check and the guards below // // Disable louvain(experimental::graph_view_t,...) 
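// A minimal sketch of how the dendrogram-returning louvain() overload and the
// flatten_dendrogram() helper declared in algorithms.hpp above are meant to be
// used together; the clustering-pointer louvain() defined further below wraps
// this same sequence. The function name, the single-GPU
// graph_view_t<int32_t, int32_t, float, false, false> instantiation, and the
// include paths are assumptions for illustration only.
#include <algorithms.hpp>
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>

#include <memory>
#include <tuple>

void louvain_flatten_sketch(
  raft::handle_t const &handle,
  cugraph::experimental::graph_view_t<int32_t, int32_t, float, false, false> const &graph_view)
{
  std::unique_ptr<cugraph::Dendrogram<int32_t>> dendrogram{};
  float modularity{};

  // Run Louvain, keeping the full cluster hierarchy instead of a flat labeling.
  std::tie(dendrogram, modularity) =
    cugraph::louvain(handle, graph_view, size_t{100}, float{1});

  // Collapse the hierarchy into one cluster id per vertex.
  rmm::device_uvector<int32_t> clustering_v(graph_view.get_number_of_vertices(),
                                            handle.get_stream());
  cugraph::flatten_dendrogram(handle, graph_view, *dendrogram, clustering_v.data());

  // dendrogram->num_levels() corresponds to the level count returned by the
  // clustering-pointer overload.
}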
@@ -87,27 +65,73 @@ std::pair louvain( weight_t wt = runner(max_level, resolution); - rmm::device_uvector vertex_ids_v(graph_view.get_number_of_vertices(), - handle.get_stream()); + return std::make_pair(runner.move_dendrogram(), wt); + } +} - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_ids_v.begin(), - vertex_ids_v.end(), - graph_view.get_local_vertex_first()); +template +void flatten_dendrogram(raft::handle_t const &handle, + GraphCSRView const &graph_view, + Dendrogram const &dendrogram, + vertex_t *clustering) +{ + rmm::device_uvector vertex_ids_v(graph_view.number_of_vertices, handle.get_stream()); - partition_at_level(handle, - runner.get_dendrogram(), - vertex_ids_v.data(), - clustering, - runner.get_dendrogram().num_levels()); + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_ids_v.begin(), + vertex_ids_v.end(), + vertex_t{0}); - // FIXME: Consider returning the Dendrogram at some point - return std::make_pair(runner.get_dendrogram().num_levels(), wt); - } + partition_at_level(handle, + dendrogram, + vertex_ids_v.data(), + clustering, + dendrogram.num_levels()); +} + +template +void flatten_dendrogram( + raft::handle_t const &handle, + experimental::graph_view_t const &graph_view, + Dendrogram const &dendrogram, + vertex_t *clustering) +{ + rmm::device_uvector vertex_ids_v(graph_view.get_number_of_vertices(), + handle.get_stream()); + + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_ids_v.begin(), + vertex_ids_v.end(), + graph_view.get_local_vertex_first()); + + partition_at_level(handle, + dendrogram, + vertex_ids_v.data(), + clustering, + dendrogram.num_levels()); } } // namespace detail +template +std::pair>, typename graph_t::weight_type> +louvain(raft::handle_t const &handle, + graph_t const &graph, + size_t max_level, + typename graph_t::weight_type resolution) +{ + return detail::louvain(handle, graph, max_level, resolution); +} + +template +void flatten_dendrogram(raft::handle_t const &handle, + graph_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_t::vertex_type *clustering) { + + detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering); +} + template std::pair louvain(raft::handle_t const &handle, graph_t const &graph, @@ -115,9 +139,19 @@ std::pair louvain(raft::handle_t const &h size_t max_level, typename graph_t::weight_type resolution) { + using vertex_t = typename graph_t::vertex_type; + using weight_t = typename graph_t::weight_type; + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); - return detail::louvain(handle, graph, clustering, max_level, resolution); + std::unique_ptr> dendrogram; + weight_t modularity; + + std::tie(dendrogram, modularity) = louvain(handle, graph, max_level, resolution); + + flatten_dendrogram(handle, graph, *dendrogram, clustering); + + return std::make_pair(dendrogram->num_levels(), modularity); } // Explicit template instantations diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index b81e619e747..0862bbc62a9 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -142,7 +142,7 @@ class Louvain { Dendrogram &get_dendrogram() { return *dendrogram_; } - std::unique_ptr> move_dendrogram() { return dendrogram_; } + std::unique_ptr> move_dendrogram() { return std::move(dendrogram_); } virtual weight_t operator()(size_t 
max_level, weight_t resolution) { diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 60086341e4b..9b2840aab3a 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include @@ -73,7 +73,7 @@ class Louvain { Dendrogram &get_dendrogram() { return *dendrogram_; } - std::unique_ptr> move_dendrogram() { return dendrogram_; } + std::unique_ptr> move_dendrogram() { return std::move(dendrogram_); } virtual weight_t operator()(size_t max_level, weight_t resolution) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 80c7b8193fb..b8745c4fb86 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -427,10 +427,12 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG LOUVAIN tests --------------------------------------------------------------------- set(MG_LOUVAIN_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_helper.cu" "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_test.cpp") ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + target_link_libraries(MG_LOUVAIN_TEST PRIVATE cugraph) else(MPI_CXX_FOUND) message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu new file mode 100644 index 00000000000..57e030da214 --- /dev/null +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_louvain_helper.hpp" + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector gather_distributed_vector(raft::handle_t const &handle, + T const *d_input, + size_t size) +{ + auto rx_sizes = + cugraph::experimental::host_scalar_gather(handle.get_comms(), size, 0, handle.get_stream()); + std::vector rx_displs(static_cast(handle.get_comms().get_rank()) == 0 + ? 
handle.get_comms().get_size() + : int{0}, + size_t{0}); + if (static_cast(handle.get_comms().get_rank()) == 0) { + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + } + + auto total_size = thrust::reduce(thrust::host, rx_sizes.begin(), rx_sizes.end()); + rmm::device_uvector gathered_v(total_size, handle.get_stream()); + + cugraph::experimental::device_gatherv(handle.get_comms(), + d_input, + gathered_v.data(), + size, + rx_sizes, + rx_displs, + 0, + handle.get_stream()); + + return gathered_v; +} + +template +bool compare_renumbered_vectors(raft::handle_t const &handle, + rmm::device_uvector const &v1, + rmm::device_uvector const &v2) +{ + vertex_t max = 1 + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + v1.begin(), + v1.end(), + vertex_t{0}); + + rmm::device_uvector map(max, size_t{0}); + + auto iter = thrust::make_zip_iterator(thrust::make_tuple(v1.begin(), v2.begin())); + + thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + iter, + iter + v1.size(), + [d_map = map.data()] __device__(auto pair) { + vertex_t e1 = thrust::get<0>(pair); + vertex_t e2 = thrust::get<1>(pair); + + d_map[e1] = e2; + }); + + auto error_count = thrust::count_if( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + iter, + iter + v1.size(), + [d_map = map.data()] __device__(auto pair) { + vertex_t e1 = thrust::get<0>(pair); + vertex_t e2 = thrust::get<1>(pair); + + return (d_map[e1] != e2); + }); + + return (error_count == 0); +} + +template +void single_gpu_renumber_edgelist_given_number_map(raft::handle_t const &handle, + rmm::device_uvector &edgelist_rows_v, + rmm::device_uvector &edgelist_cols_v, + rmm::device_uvector &renumber_map_gathered_v) +{ + rmm::device_uvector index_v(renumber_map_gathered_v.size(), handle.get_stream()); + + thrust::for_each( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(renumber_map_gathered_v.size()), + [d_renumber_map_gathered = renumber_map_gathered_v.data(), d_index = index_v.data()] __device__( + auto idx) { d_index[d_renumber_map_gathered[idx]] = idx; }); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_rows_v.begin(), + edgelist_rows_v.end(), + edgelist_rows_v.begin(), + [d_index = index_v.data()] __device__(auto v) { return d_index[v]; }); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_cols_v.begin(), + edgelist_cols_v.end(), + edgelist_cols_v.begin(), + [d_index = index_v.data()] __device__(auto v) { return d_index[v]; }); +} + +template +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + compressed_sparse_to_edgelist(edge_t const *compressed_sparse_offsets, + vertex_t const *compressed_sparse_indices, + weight_t const *compressed_sparse_weights, + vertex_t major_first, + vertex_t major_last, + cudaStream_t stream) +{ + edge_t number_of_edges{0}; + raft::update_host( + &number_of_edges, compressed_sparse_offsets + (major_last - major_first), 1, stream); + CUDA_TRY(cudaStreamSynchronize(stream)); + rmm::device_uvector edgelist_major_vertices(number_of_edges, stream); + rmm::device_uvector edgelist_minor_vertices(number_of_edges, stream); + rmm::device_uvector edgelist_weights( + compressed_sparse_weights != nullptr ? 
number_of_edges : 0, stream); + + // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can + // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA + // warp per vertex, and low-degree vertices using one CUDA thread per block + thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(major_first), + thrust::make_counting_iterator(major_last), + [compressed_sparse_offsets, + major_first, + p_majors = edgelist_major_vertices.begin()] __device__(auto v) { + auto first = compressed_sparse_offsets[v - major_first]; + auto last = compressed_sparse_offsets[v - major_first + 1]; + thrust::fill(thrust::seq, p_majors + first, p_majors + last, v); + }); + thrust::copy(rmm::exec_policy(stream)->on(stream), + compressed_sparse_indices, + compressed_sparse_indices + number_of_edges, + edgelist_minor_vertices.begin()); + if (compressed_sparse_weights != nullptr) { + thrust::copy(rmm::exec_policy(stream)->on(stream), + compressed_sparse_weights, + compressed_sparse_weights + number_of_edges, + edgelist_weights.data()); + } + + return std::make_tuple(std::move(edgelist_major_vertices), + std::move(edgelist_minor_vertices), + std::move(edgelist_weights)); +} + +template +void sort_and_coarsen_edgelist(rmm::device_uvector &edgelist_major_vertices /* [INOUT] */, + rmm::device_uvector &edgelist_minor_vertices /* [INOUT] */, + rmm::device_uvector &edgelist_weights /* [INOUT] */, + cudaStream_t stream) +{ + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + + size_t number_of_edges{0}; + if (edgelist_weights.size() > 0) { + thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size(), + edgelist_weights.begin()); + + rmm::device_uvector tmp_edgelist_major_vertices(edgelist_major_vertices.size(), + stream); + rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), + stream); + rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); + auto it = thrust::reduce_by_key( + rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size(), + edgelist_weights.begin(), + thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), + tmp_edgelist_minor_vertices.begin())), + tmp_edgelist_weights.begin()); + number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it)); + + edgelist_major_vertices = std::move(tmp_edgelist_major_vertices); + edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices); + edgelist_weights = std::move(tmp_edgelist_weights); + } else { + thrust::sort(rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size()); + auto it = thrust::unique(rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size()); + number_of_edges = thrust::distance(pair_first, it); + } + + edgelist_major_vertices.resize(number_of_edges, stream); + edgelist_minor_vertices.resize(number_of_edges, stream); + edgelist_weights.resize(number_of_edges, stream); + edgelist_major_vertices.shrink_to_fit(stream); + edgelist_minor_vertices.shrink_to_fit(stream); + edgelist_weights.shrink_to_fit(stream); +} + +template +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + edge_t const 
*compressed_sparse_offsets, + vertex_t const *compressed_sparse_indices, + weight_t const *compressed_sparse_weights, + vertex_t const *p_major_labels, + vertex_t const *p_minor_labels, + vertex_t major_first, + vertex_t major_last, + vertex_t minor_first, + vertex_t minor_last, + cudaStream_t stream) +{ + // FIXME: it might be possible to directly create relabled & coarsened edgelist from the + // compressed sparse format to save memory + + rmm::device_uvector edgelist_major_vertices(0, stream); + rmm::device_uvector edgelist_minor_vertices(0, stream); + rmm::device_uvector edgelist_weights(0, stream); + std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) = + compressed_sparse_to_edgelist(compressed_sparse_offsets, + compressed_sparse_indices, + compressed_sparse_weights, + major_first, + major_last, + stream); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + thrust::transform( + rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size(), + pair_first, + [p_major_labels, p_minor_labels, major_first, minor_first] __device__(auto val) { + return thrust::make_tuple(p_major_labels[thrust::get<0>(val) - major_first], + p_minor_labels[thrust::get<1>(val) - minor_first]); + }); + + sort_and_coarsen_edgelist( + edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream); + + return std::make_tuple(std::move(edgelist_major_vertices), + std::move(edgelist_minor_vertices), + std::move(edgelist_weights)); +} + +// single-GPU version +template +std::unique_ptr> +coarsen_graph( + raft::handle_t const &handle, + cugraph::experimental::graph_view_t const + &graph_view, + vertex_t const *labels) +{ + rmm::device_uvector coarsened_edgelist_major_vertices(0, handle.get_stream()); + rmm::device_uvector coarsened_edgelist_minor_vertices(0, handle.get_stream()); + rmm::device_uvector coarsened_edgelist_weights(0, handle.get_stream()); + std::tie(coarsened_edgelist_major_vertices, + coarsened_edgelist_minor_vertices, + coarsened_edgelist_weights) = + compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + graph_view.offsets(), + graph_view.indices(), + graph_view.weights(), + labels, + labels, + vertex_t{0}, + graph_view.get_number_of_vertices(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::edgelist_t edgelist{}; + edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() + : coarsened_edgelist_major_vertices.data(); + edgelist.p_dst_vertices = store_transposed ? 
coarsened_edgelist_major_vertices.data() + : coarsened_edgelist_minor_vertices.data(); + edgelist.p_edge_weights = coarsened_edgelist_weights.data(); + edgelist.number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + + vertex_t new_number_of_vertices = + 1 + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels, + labels + graph_view.get_number_of_vertices(), + vertex_t{0}, + thrust::maximum()); + + return std::make_unique< + cugraph::experimental::graph_t>( + handle, + edgelist, + new_number_of_vertices, + cugraph::experimental::graph_properties_t{graph_view.is_symmetric(), false}, + true); +} + +// explicit instantiation + +template void single_gpu_renumber_edgelist_given_number_map( + raft::handle_t const &handle, + rmm::device_uvector &d_edgelist_rows, + rmm::device_uvector &d_edgelist_cols, + rmm::device_uvector &d_renumber_map_gathered_v); + +template rmm::device_uvector gather_distributed_vector(raft::handle_t const &handle, + int const *d_input, + size_t size); + +template bool compare_renumbered_vectors(raft::handle_t const &handle, + rmm::device_uvector const &v1, + rmm::device_uvector const &v2); + +template std::unique_ptr> +coarsen_graph( + raft::handle_t const &handle, + cugraph::experimental::graph_view_t const &graph_view, + int32_t const *labels); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/community/mg_louvain_helper.hpp b/cpp/tests/community/mg_louvain_helper.hpp new file mode 100644 index 00000000000..87d46907bfc --- /dev/null +++ b/cpp/tests/community/mg_louvain_helper.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector gather_distributed_vector(raft::handle_t const &handle, + T const *d_input, + size_t size); + +template +bool compare_renumbered_vectors(raft::handle_t const &handle, + rmm::device_uvector const &v1, + rmm::device_uvector const &v2); + +template +void single_gpu_renumber_edgelist_given_number_map( + raft::handle_t const &handle, + rmm::device_uvector &d_edgelist_rows, + rmm::device_uvector &d_edgelist_cols, + rmm::device_uvector &d_renumber_map_gathered_v); + +template +std::unique_ptr> +coarsen_graph(raft::handle_t const &handle, + cugraph::experimental::graph_view_t const &graph_view, + vertex_t const *labels); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index e97f93d4424..d62a0c8eae1 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -14,16 +14,21 @@ * limitations under the License. 
*/ +#include "mg_louvain_helper.hpp" + #include #include #include #include +#include #include #include #include +#include + #include void compare(float modularity, float sg_modularity) { ASSERT_FLOAT_EQ(modularity, sg_modularity); } @@ -75,39 +80,87 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam virtual void SetUp() {} virtual void TearDown() {} - // Return the results of running louvain on a single GPU for the dataset in - // graph_file_path. + // Compare the results of MNMG Louvain with the results of running + // each step of SG Louvain, renumbering the coarsened graphs based + // on the MNMG renumbering. template - std::tuple> get_sg_results( - raft::handle_t const& handle, - std::string const& graph_file_path, - size_t max_level, - weight_t resolution) + void compare_sg_results(raft::handle_t const& handle, + std::string const& graph_filename, + rmm::device_uvector& d_renumber_map_gathered_v, + cugraph::Dendrogram const& dendrogram, + weight_t resolution, + int rank, + weight_t modularity) { - // FIXME: Put this in the Graph test base class - // (make the call simpler here) - auto graph_tuple = - cugraph::test::read_graph_from_matrix_market_file( - handle, - graph_file_path, - true, - false); // FIXME: should use param.test_weighted instead of true - - auto graph_view = std::get<0>(graph_tuple).view(); - cudaStream_t stream = handle.get_stream(); + auto sg_graph = + std::make_unique>( + handle); + rmm::device_uvector d_clustering_v(0, handle.get_stream()); + weight_t sg_modularity; - rmm::device_uvector clustering_v(graph_view.get_number_of_local_vertices(), stream); + if (rank == 0) { + // Create initial SG graph, renumbered according to the MNMG renumber map + rmm::device_uvector d_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_edgelist_cols(0, handle.get_stream()); + rmm::device_uvector d_edgelist_weights(0, handle.get_stream()); + vertex_t number_of_vertices{}; + bool is_symmetric{}; + + std::tie( + d_edgelist_rows, d_edgelist_cols, d_edgelist_weights, number_of_vertices, is_symmetric) = + cugraph::test::read_edgelist_from_matrix_market_file( + handle, graph_filename, true); + + rmm::device_uvector d_vertices(number_of_vertices, handle.get_stream()); + std::vector h_vertices(number_of_vertices); + + d_clustering_v.resize(d_vertices.size(), handle.get_stream()); + + thrust::sequence(thrust::host, h_vertices.begin(), h_vertices.end(), vertex_t{0}); + raft::update_device( + d_vertices.data(), h_vertices.data(), d_vertices.size(), handle.get_stream()); + + // renumber using d_renumber_map_gathered_v + cugraph::test::single_gpu_renumber_edgelist_given_number_map( + handle, d_edgelist_rows, d_edgelist_cols, d_renumber_map_gathered_v); + + std::tie(*sg_graph, std::ignore) = + cugraph::test::generate_graph_from_edgelist( + handle, + std::move(d_vertices), + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + is_symmetric, + true, + false); + } - size_t level; - weight_t modularity; + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(dendrogram.num_levels()), + [&dendrogram, &sg_graph, &d_clustering_v, &sg_modularity, &handle, resolution, rank]( + size_t i) { + auto d_dendrogram_gathered_v = cugraph::test::gather_distributed_vector( + handle, dendrogram.get_level_ptr_nocheck(i), dendrogram.get_level_size_nocheck(i)); + + if (rank == 0) { + auto graph_view = sg_graph->view(); + + d_clustering_v.resize(graph_view.get_number_of_vertices(), handle.get_stream()); - std::tie(level, 
modularity) = - cugraph::louvain(handle, graph_view, clustering_v.data(), max_level, resolution); + std::tie(std::ignore, sg_modularity) = + cugraph::louvain(handle, graph_view, d_clustering_v.data(), 1, resolution); - std::vector clustering(graph_view.get_number_of_local_vertices()); - raft::update_host(clustering.data(), clustering_v.data(), clustering_v.size(), stream); + EXPECT_TRUE(cugraph::test::compare_renumbered_vectors( + handle, d_clustering_v, d_dendrogram_gathered_v)); - return std::make_tuple(level, modularity, clustering); + sg_graph = + cugraph::test::coarsen_graph(handle, graph_view, d_dendrogram_gathered_v.data()); + } + }); + + if (rank == 0) compare(modularity, sg_modularity); } // Compare the results of running louvain on multiple GPUs to that of a @@ -134,43 +187,33 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam cudaStream_t stream = handle.get_stream(); cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); std::tie(mg_graph, d_renumber_map_labels) = cugraph::test::read_graph_from_matrix_market_file( - handle, param.graph_file_full_path, true, false); - - // Each GPU will have a subset of the clustering - int sg_level; - weight_t sg_modularity; - std::vector sg_clustering; - - // FIXME: Consider how to test for max_level > 1 - // perhaps some sort of approximation - // size_t local_max_level{param.max_level}; - size_t local_max_level{1}; + handle, param.graph_file_full_path, true, true); auto mg_graph_view = mg_graph.view(); - rmm::device_uvector clustering_v(mg_graph_view.get_number_of_local_vertices(), - stream); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - int level; + std::unique_ptr> dendrogram; weight_t modularity; - std::tie(level, modularity) = cugraph::louvain( - handle, mg_graph_view, clustering_v.data(), local_max_level, param.resolution); + std::tie(dendrogram, modularity) = + cugraph::louvain(handle, mg_graph_view, param.max_level, param.resolution); - if (comm_rank == 0) { - SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); + SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); - std::tie(sg_level, sg_modularity, sg_clustering) = get_sg_results( - handle, param.graph_file_full_path, local_max_level, param.resolution); + auto d_renumber_map_gathered_v = cugraph::test::gather_distributed_vector( + handle, d_renumber_map_labels.data(), d_renumber_map_labels.size()); - compare(modularity, sg_modularity); - } + compare_sg_results(handle, + param.graph_file_full_path, + d_renumber_map_gathered_v, + *dendrogram, + param.resolution, + comm_rank, + modularity); } }; @@ -183,7 +226,8 @@ TEST_P(Louvain_MG_Testfixture, CheckInt32Int32Float) INSTANTIATE_TEST_CASE_P( simple_test, Louvain_MG_Testfixture, - ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 100, 1), - Louvain_Usecase("test/datasets/smallworld.mtx", true, 100, 1))); + ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 100, 1) + //,Louvain_Usecase("test/datasets/smallworld.mtx", true, 100, 1) + )); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index e8f11acfbf4..78ac57dba58 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -160,6 +160,11 @@ inline auto parse_test_options(int argc, char **argv) auto const cmd_opts = parse_test_options(argc, argv); \ auto const rmm_mode = 
cmd_opts["rmm_mode"].as(); \ auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + \ + if (comm_rank != 0) { \ + auto& listeners = ::testing::UnitTest::GetInstance()->listeners(); \ + delete listeners.Release(listeners.default_result_printer()); \ + } \ rmm::mr::set_current_device_resource(resource.get()); \ auto ret = RUN_ALL_TESTS(); \ MPI_TRY(MPI_Finalize()); \ From b9cdf406a6d6426cc73fbf320fecf56acde97a55 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 13:44:57 -0400 Subject: [PATCH 10/18] clean up serial louvain tests --- cpp/include/algorithms.hpp | 4 +- cpp/src/experimental/louvain.cuh | 8 +- cpp/tests/CMakeLists.txt | 8 -- cpp/tests/community/louvain_test.cpp | 160 +++++++++++++++++++++++-- cpp/tests/experimental/louvain_test.cu | 133 -------------------- 5 files changed, 158 insertions(+), 155 deletions(-) delete mode 100644 cpp/tests/experimental/louvain_test.cu diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp index 1c4a84a27e4..beb4333d508 100644 --- a/cpp/include/algorithms.hpp +++ b/cpp/include/algorithms.hpp @@ -656,8 +656,8 @@ std::pair louvain( * * @tparam graph_t Type of graph * - * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, - * @param[in] graph input graph object (CSR) + * @param[in] handle Library handle (RAFT) + * @param[in] graph Input graph object (CSR) * @param[in] max_level (optional) maximum number of levels to run (default 100) * @param[in] resolution (optional) The value of the resolution parameter to use. * Called gamma in the modularity formula, this changes the size diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 9b2840aab3a..30c9c1cfc4d 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -29,6 +29,7 @@ #include #include +#include //#define TIMING @@ -408,6 +409,11 @@ class Louvain { map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); map_value_first = dst_cluster_weights_v.begin(); } else { + thrust::sort_by_key(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.begin()); + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), next_cluster_v.begin(), next_cluster_v.end(), @@ -415,7 +421,7 @@ class Louvain { [d_cluster_weights = cluster_weights_v_.data(), d_cluster_keys = cluster_keys_v_.data(), num_clusters = cluster_keys_v_.size()] __device__(vertex_t cluster) { - auto pos = thrust::find( + auto pos = thrust::lower_bound( thrust::seq, d_cluster_keys, d_cluster_keys + num_clusters, cluster); return d_cluster_weights[pos - d_cluster_keys]; }); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9b8c1b9322e..97d3f77e26e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -391,14 +391,6 @@ set(EXPERIMENTAL_PAGERANK_TEST_SRCS ConfigureTest(EXPERIMENTAL_PAGERANK_TEST "${EXPERIMENTAL_PAGERANK_TEST_SRCS}") -################################################################################################### -# - Experimental LOUVAIN tests ------------------------------------------------------------------- - -set(EXPERIMENTAL_LOUVAIN_TEST_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/experimental/louvain_test.cu") - -ConfigureTest(EXPERIMENTAL_LOUVAIN_TEST "${EXPERIMENTAL_LOUVAIN_TEST_SRCS}") - ################################################################################################### # - Experimental KATZ_CENTRALITY tests 
------------------------------------------------------------ diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index d3024282be3..ab92ce411b9 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -9,15 +9,141 @@ * */ #include +#include + +#include +#include +#include +#include + +#include #include -#include -#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +struct Louvain_Usecase { + std::string graph_file_full_path_{}; + bool test_weighted_{false}; + int expected_level_{0}; + float expected_modularity_{0}; + + Louvain_Usecase(std::string const& graph_file_path, + bool test_weighted, + int expected_level, + float expected_modularity) + : test_weighted_(test_weighted), + expected_level_(expected_level), + expected_modularity_(expected_modularity) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path_ = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path_ = graph_file_path; + } + }; +}; + +class Tests_Louvain : public ::testing::TestWithParam { + public: + Tests_Louvain() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_legacy_test(Louvain_Usecase const& configuration) + { + raft::handle_t handle{}; + + bool directed{false}; + + auto graph = cugraph::test::generate_graph_csr_from_mm( + directed, configuration.graph_file_full_path_); + auto graph_view = graph->view(); + + louvain(graph_view, + graph_view.get_number_of_vertices(), + configuration.expected_level_, + configuration.expected_modularity_); + } -#include + template + void run_current_test(Louvain_Usecase const& configuration) + { + raft::handle_t handle{}; + + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path_, configuration.test_weighted_, false); + + auto graph_view = graph.view(); + + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + cudaDeviceProp device_prop; + CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0)); + + if (device_prop.major < 7) { + EXPECT_THROW(louvain(graph_view, + graph_view.get_number_of_local_vertices(), + configuration.expected_level_, + configuration.expected_modularity_), + cugraph::logic_error); + } else { + louvain(graph_view, + graph_view.get_number_of_local_vertices(), + configuration.expected_level_, + configuration.expected_modularity_); + } + } + + template + void louvain(graph_t const& graph_view, + typename graph_t::vertex_type num_vertices, + int expected_level, + float expected_modularity) + { + using vertex_t = typename graph_t::vertex_type; + using weight_t = typename graph_t::weight_type; + + raft::handle_t handle{}; + + rmm::device_uvector clustering_v(num_vertices, handle.get_stream()); + size_t level; + weight_t modularity; + + std::tie(level, modularity) = + cugraph::louvain(handle, graph_view, clustering_v.data(), size_t{100}, weight_t{1}); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + float compare_modularity = static_cast(modularity); -TEST(louvain, success) + ASSERT_FLOAT_EQ(compare_modularity, expected_modularity); 
+ ASSERT_EQ(level, expected_level); + } +}; + +// FIXME: add tests for type combinations + +TEST(louvain_legacy, success) { raft::handle_t handle; @@ -84,15 +210,13 @@ TEST(louvain, success) int min = *min_element(cluster_id.begin(), cluster_id.end()); - std::cout << "modularity = " << modularity << std::endl; - ASSERT_GE(min, 0); - ASSERT_GE(modularity, 0.402777 * 0.95); + ASSERT_FLOAT_EQ(modularity, 0.408695); ASSERT_EQ(cluster_id, result_h); } } -TEST(louvain_renumbered, success) +TEST(louvain_legacy_renumbered, success) { raft::handle_t handle; @@ -157,11 +281,25 @@ TEST(louvain_renumbered, success) int min = *min_element(cluster_id.begin(), cluster_id.end()); - std::cout << "modularity = " << modularity << std::endl; - ASSERT_GE(min, 0); - ASSERT_GE(modularity, 0.402777 * 0.95); + ASSERT_FLOAT_EQ(modularity, 0.41880345); } } +TEST_P(Tests_Louvain, CheckInt32Int32FloatFloatLegacy) +{ + run_legacy_test(GetParam()); +} + +TEST_P(Tests_Louvain, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +// FIXME: Expand testing once we evaluate RMM memory use +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_Louvain, + ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 3, 0.408695))); + CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/louvain_test.cu b/cpp/tests/experimental/louvain_test.cu deleted file mode 100644 index 56fb2c109bf..00000000000 --- a/cpp/tests/experimental/louvain_test.cu +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governin_from_mtxg permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include - -#include -#include - -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -typedef struct Louvain_Usecase_t { - std::string graph_file_full_path{}; - bool test_weighted{false}; - - Louvain_Usecase_t(std::string const& graph_file_path, bool test_weighted) - : test_weighted(test_weighted) - { - if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { - graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; - } else { - graph_file_full_path = graph_file_path; - } - }; -} Louvain_Usecase; - -class Tests_Louvain : public ::testing::TestWithParam { - public: - Tests_Louvain() {} - static void SetupTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - template - void run_current_test(Louvain_Usecase const& configuration) - { - raft::handle_t handle{}; - - std::cout << "read graph file: " << configuration.graph_file_full_path << std::endl; - - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.graph_file_full_path, configuration.test_weighted, false); - - auto graph_view = graph.view(); - - // "FIXME": remove this check once we drop support for Pascal - // - // Calling louvain on Pascal will throw an exception, we'll check that - // this is the behavior while we still support Pascal (device_prop.major < 7) - // - cudaDeviceProp device_prop; - CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0)); - - if (device_prop.major < 7) { - EXPECT_THROW(louvain(graph_view), cugraph::logic_error); - } else { - louvain(graph_view); - } - } - - template - void louvain(graph_t const& graph_view) - { - using vertex_t = typename graph_t::vertex_type; - using weight_t = typename graph_t::weight_type; - - raft::handle_t handle{}; - - rmm::device_vector clustering_v(graph_view.get_number_of_local_vertices()); - size_t level; - weight_t modularity; - - std::tie(level, modularity) = - cugraph::louvain(handle, graph_view, clustering_v.data().get(), size_t{100}, weight_t{1}); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::cout << "level = " << level << std::endl; - std::cout << "modularity = " << modularity << std::endl; - } -}; - -// FIXME: add tests for type combinations -TEST_P(Tests_Louvain, CheckInt32Int32FloatFloat) -{ - run_current_test(GetParam()); -} - -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_Louvain, - ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true) -#if 0 - , - Louvain_Usecase("test/datasets/web-Google.mtx", true), - Louvain_Usecase("test/datasets/ljournal-2008.mtx", true), - Louvain_Usecase("test/datasets/webbase-1M.mtx", true) -#endif - )); - -CUGRAPH_TEST_PROGRAM_MAIN() From b64e65bec3ade1462101faa15451bead0de6c17f Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 15:42:38 -0400 Subject: [PATCH 11/18] revert generate_graph_from_edgelist changes, not necessary with new test --- cpp/src/community/louvain.cu | 79 ++++++- cpp/tests/community/mg_louvain_test.cpp | 2 +- .../utilities/generate_graph_from_edgelist.cu | 210 +++++------------- 3 files changed, 123 insertions(+), 168 deletions(-) diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index ab27923cdb1..a3a2ffa39eb 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -82,11 +82,8 @@ void 
flatten_dendrogram(raft::handle_t const &handle, vertex_ids_v.end(), vertex_t{0}); - partition_at_level(handle, - dendrogram, - vertex_ids_v.data(), - clustering, - dendrogram.num_levels()); + partition_at_level( + handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels()); } template @@ -104,11 +101,8 @@ void flatten_dendrogram( vertex_ids_v.end(), graph_view.get_local_vertex_first()); - partition_at_level(handle, - dendrogram, - vertex_ids_v.data(), - clustering, - dendrogram.num_levels()); + partition_at_level( + handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels()); } } // namespace detail @@ -127,8 +121,8 @@ template void flatten_dendrogram(raft::handle_t const &handle, graph_t const &graph_view, Dendrogram const &dendrogram, - typename graph_t::vertex_type *clustering) { - + typename graph_t::vertex_type *clustering) +{ detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering); } @@ -155,6 +149,67 @@ std::pair louvain(raft::handle_t const &h } // Explicit template instantations +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); + template std::pair louvain( raft::handle_t const &, GraphCSRView const &, int32_t *, size_t, float); template std::pair louvain(raft::handle_t const &, diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index d62a0c8eae1..f6596a6b59a 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -150,7 +150,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam d_clustering_v.resize(graph_view.get_number_of_vertices(), handle.get_stream()); std::tie(std::ignore, sg_modularity) = - cugraph::louvain(handle, graph_view, d_clustering_v.data(), 1, resolution); + cugraph::louvain(handle, graph_view, d_clustering_v.data(), size_t{1}, resolution); EXPECT_TRUE(cugraph::test::compare_renumbered_vectors( handle, d_clustering_v, d_dendrogram_gathered_v)); diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index fa10c31a5a4..1b9fe6051f7 100644 --- 
a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -30,47 +30,39 @@ namespace test { namespace detail { -template -struct compute_gpu_id_from_vertex_no_renumbering { - int comm_size{0}; - vertex_t num_vertices{1}; - - __device__ int operator()(vertex_t v) const - { - vertex_t vertices_per_gpu = static_cast(num_vertices + comm_size - 1) / comm_size; - return static_cast(v / vertices_per_gpu); - } -}; - -template -struct compute_gpu_id_from_edge_no_renumbering { - bool hypergraph_partitioned{false}; - int comm_size{0}; - int row_comm_size{0}; - int col_comm_size{0}; - vertex_t num_vertices{1}; - - __device__ int operator()(vertex_t major, vertex_t minor) const - { - vertex_t vertices_per_gpu = static_cast(num_vertices + comm_size - 1) / comm_size; - auto major_comm_rank = static_cast(major / vertices_per_gpu); - auto minor_comm_rank = static_cast(minor / vertices_per_gpu); - - if (hypergraph_partitioned) { - return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); - } else { - return (major_comm_rank - (major_comm_rank % row_comm_size)) + - (minor_comm_rank / col_comm_size); - } - } -}; - -template -void filter_vertices(raft::handle_t const& handle, - rmm::device_uvector& vertices, - key_function_t vertex_key_func, - int comm_rank) +template +std::enable_if_t< + multi_gpu, + std::tuple< + cugraph::experimental::graph_t, + rmm::device_uvector>> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { + CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); + + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + vertex_t number_of_vertices = static_cast(vertices.size()); + + auto vertex_key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; vertices.resize(thrust::distance(vertices.begin(), thrust::remove_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), @@ -80,17 +72,10 @@ void filter_vertices(raft::handle_t const& handle, return key_func(val) != comm_rank; })), handle.get_stream()); -} + vertices.shrink_to_fit(handle.get_stream()); -template -void filter_edges(raft::handle_t const& handle, - rmm::device_uvector& edgelist_rows, - rmm::device_uvector& edgelist_cols, - rmm::device_uvector& edgelist_weights, - key_function_t edge_key_func, - int comm_rank, - bool test_weighted) -{ + auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + false, comm_size, row_comm_size, col_comm_size}; size_t number_of_local_edges{}; if (test_weighted) { auto edge_first = thrust::make_zip_iterator( @@ -121,120 +106,35 @@ void filter_edges(raft::handle_t const& handle, } edgelist_rows.resize(number_of_local_edges, handle.get_stream()); - edgelist_cols.resize(number_of_local_edges, handle.get_stream()); - if (test_weighted) edgelist_weights.resize(number_of_local_edges, handle.get_stream()); -} - -template -std::enable_if_t< - multi_gpu, - std::tuple< - 
cugraph::experimental::graph_t, - rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) -{ - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto const row_comm_rank = row_comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_size = col_comm.get_size(); - auto const col_comm_rank = col_comm.get_rank(); - - vertex_t number_of_vertices = static_cast(vertices.size()); - edge_t number_of_edges = static_cast(edgelist_rows.size()); - - if (renumber) { - filter_vertices( - handle, - vertices, - cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}, - comm_rank); - filter_edges( - handle, - edgelist_rows, - edgelist_cols, - edgelist_weights, - cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - false, comm_size, row_comm_size, col_comm_size}, - comm_rank, - test_weighted); - } else { - filter_vertices( - handle, - vertices, - compute_gpu_id_from_vertex_no_renumbering{comm_size, number_of_vertices}, - comm_rank); - filter_edges(handle, - edgelist_rows, - edgelist_cols, - edgelist_weights, - compute_gpu_id_from_edge_no_renumbering{ - false, comm_size, row_comm_size, col_comm_size, number_of_vertices}, - comm_rank, - test_weighted); - } - - vertices.shrink_to_fit(handle.get_stream()); edgelist_rows.shrink_to_fit(handle.get_stream()); + edgelist_cols.resize(number_of_local_edges, handle.get_stream()); edgelist_cols.shrink_to_fit(handle.get_stream()); - if (test_weighted) edgelist_weights.shrink_to_fit(handle.get_stream()); + if (test_weighted) { + edgelist_weights.resize(number_of_local_edges, handle.get_stream()); + edgelist_weights.shrink_to_fit(handle.get_stream()); + } // 3. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; vertex_t aggregate_number_of_vertices{}; - - if (renumber) { - // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = - cugraph::experimental::renumber_edgelist( - handle, - vertices.data(), - static_cast(vertices.size()), - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? 
edgelist_rows.data() : edgelist_cols.data(), - edgelist_rows.size(), - false, - true); - assert(aggregate_number_of_vertices == number_of_vertices); - } else { - std::vector vertex_partition_offsets(comm_size + 1, 0); - vertex_t vertices_per_gpu = - static_cast(number_of_vertices + comm_size - 1) / comm_size; - - std::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(comm_size), - [h_vertex_partition_offsets = vertex_partition_offsets.data(), vertices_per_gpu]( - auto idx) { h_vertex_partition_offsets[idx] = idx * vertices_per_gpu; }); - - vertex_partition_offsets[comm_size] = number_of_vertices; - - partition = cugraph::experimental::partition_t(vertex_partition_offsets, - false, // is_hypergraph_partitioned, - row_comm_size, - col_comm_size, - row_comm_rank, - col_comm_rank); - - //number_of_edges = edgelist_rows.size(); - } + edge_t number_of_edges{}; + // FIXME: set do_expensive_check to false once validated + std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = + cugraph::experimental::renumber_edgelist( + handle, + vertices.data(), + static_cast(vertices.size()), + store_transposed ? edgelist_cols.data() : edgelist_rows.data(), + store_transposed ? edgelist_rows.data() : edgelist_cols.data(), + edgelist_rows.size(), + false, + true); + assert(aggregate_number_of_vertices == number_of_vertices); // 4. create a graph + return std::make_tuple( cugraph::experimental::graph_t( handle, @@ -248,7 +148,7 @@ generate_graph_from_edgelist(raft::handle_t const& handle, number_of_vertices, number_of_edges, cugraph::experimental::graph_properties_t{is_symmetric, false}, - false, + true, true), std::move(renumber_map_labels)); } From bf522a4445f03ca6b2dc42a6d511ceb148b69343 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 15:51:37 -0400 Subject: [PATCH 12/18] update format and copyright information --- cpp/include/dendrogram.hpp | 6 +----- cpp/include/graph.hpp | 10 +++------- cpp/include/utilities/device_comm.cuh | 8 +++++--- cpp/src/experimental/louvain.cuh | 4 ++-- cpp/tests/community/mg_louvain_helper.cu | 20 ++++++++++---------- cpp/tests/community/mg_louvain_helper.hpp | 8 +++++--- cpp/tests/utilities/base_fixture.hpp | 2 +- 7 files changed, 27 insertions(+), 31 deletions(-) diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp index 83a35110605..230f64343ad 100644 --- a/cpp/include/dendrogram.hpp +++ b/cpp/include/dendrogram.hpp @@ -41,14 +41,10 @@ class Dendrogram { vertex_t const *get_level_ptr_nocheck(size_t level) const { - //return static_cast(level_ptr_[level]->data()); return level_ptr_[level]->data(); } - vertex_t *get_level_ptr_nocheck(size_t level) - { - return level_ptr_[level]->data(); - } + vertex_t *get_level_ptr_nocheck(size_t level) { return level_ptr_[level]->data(); } size_t get_level_size_nocheck(size_t level) const { return level_ptr_[level]->size(); } diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index c4ca3b40c11..8ea58546ce1 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
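
// Aside for readers following the Dendrogram accessors above and the flatten_dendrogram
// hunks earlier in this series: each dendrogram level stores, for every vertex of the
// coarsened graph at that level, the cluster it was assigned to, and "flattening" simply
// composes those per-level assignments back down to the original vertices. The sketch
// below is a host-side illustration of that idea only -- it is not the thrust/CUDA code
// in louvain.cuh, the helper name is made up for this note, and it assumes cluster ids
// at every level are 0-based and contiguous (single-GPU numbering, no vertex offset).
#include <vector>

template <typename vertex_t>
std::vector<vertex_t> flatten_dendrogram_host(std::vector<std::vector<vertex_t>> const &levels,
                                              vertex_t num_vertices)
{
  std::vector<vertex_t> clustering(num_vertices);
  for (vertex_t v = 0; v < num_vertices; ++v) clustering[v] = v;  // start from the identity
  for (auto const &level : levels) {
    for (vertex_t v = 0; v < num_vertices; ++v) {
      // relabel each vertex's current cluster id with the next (coarser) level's assignment
      clustering[v] = level[clustering[v]];
    }
  }
  return clustering;
}
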
@@ -69,13 +69,9 @@ class GraphViewBase { edge_t *local_edges; vertex_t *local_offsets; - vertex_t get_number_of_vertices() const { - return number_of_vertices; - } + vertex_t get_number_of_vertices() const { return number_of_vertices; } - vertex_t get_local_vertex_first() const { - return vertex_t{0}; - } + vertex_t get_local_vertex_first() const { return vertex_t{0}; } /** * @brief Fill the identifiers array with the vertex identifiers. diff --git a/cpp/include/utilities/device_comm.cuh b/cpp/include/utilities/device_comm.cuh index e0bbea76068..53711f21a6c 100644 --- a/cpp/include/utilities/device_comm.cuh +++ b/cpp/include/utilities/device_comm.cuh @@ -891,9 +891,11 @@ device_reduce(raft::comms::comms_t const& comm, size_t constexpr tuple_size = thrust::tuple_size::value_type>::value; - detail:: - device_reduce_tuple_iterator_element_impl().run( - comm, input_first, output_first, count, op, root, stream); + detail::device_reduce_tuple_iterator_element_impl() + .run(comm, input_first, output_first, count, op, root, stream); } template diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 30c9c1cfc4d..5ce43dbf3df 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -28,8 +28,8 @@ #include #include -#include #include +#include //#define TIMING @@ -413,7 +413,7 @@ class Louvain { cluster_keys_v_.begin(), cluster_keys_v_.end(), cluster_weights_v_.begin()); - + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), next_cluster_v.begin(), next_cluster_v.end(), diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index 57e030da214..a7f95e6d718 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -85,16 +85,16 @@ bool compare_renumbered_vectors(raft::handle_t const &handle, d_map[e1] = e2; }); - auto error_count = thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - iter, - iter + v1.size(), - [d_map = map.data()] __device__(auto pair) { - vertex_t e1 = thrust::get<0>(pair); - vertex_t e2 = thrust::get<1>(pair); - - return (d_map[e1] != e2); - }); + auto error_count = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + iter, + iter + v1.size(), + [d_map = map.data()] __device__(auto pair) { + vertex_t e1 = thrust::get<0>(pair); + vertex_t e2 = thrust::get<1>(pair); + + return (d_map[e1] != e2); + }); return (error_count == 0); } diff --git a/cpp/tests/community/mg_louvain_helper.hpp b/cpp/tests/community/mg_louvain_helper.hpp index 87d46907bfc..43eb294cd13 100644 --- a/cpp/tests/community/mg_louvain_helper.hpp +++ b/cpp/tests/community/mg_louvain_helper.hpp @@ -43,9 +43,11 @@ void single_gpu_renumber_edgelist_given_number_map( template std::unique_ptr> -coarsen_graph(raft::handle_t const &handle, - cugraph::experimental::graph_view_t const &graph_view, - vertex_t const *labels); +coarsen_graph( + raft::handle_t const &handle, + cugraph::experimental::graph_view_t const + &graph_view, + vertex_t const *labels); } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index 78ac57dba58..79a86e1fc95 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -162,7 +162,7 @@ inline auto parse_test_options(int argc, char **argv) auto resource = cugraph::test::create_memory_resource(rmm_mode); \ \ if (comm_rank != 0) { \ - auto& listeners 
= ::testing::UnitTest::GetInstance()->listeners(); \ + auto &listeners = ::testing::UnitTest::GetInstance()->listeners(); \ delete listeners.Release(listeners.default_result_printer()); \ } \ rmm::mr::set_current_device_resource(resource.get()); \ From 0810213bceca66c5b88fd80d7d835207d03da20f Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 15:56:21 -0400 Subject: [PATCH 13/18] missed a format issue --- cpp/include/dendrogram.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp index 230f64343ad..6316bdb25fc 100644 --- a/cpp/include/dendrogram.hpp +++ b/cpp/include/dendrogram.hpp @@ -39,10 +39,7 @@ class Dendrogram { size_t num_levels() const { return level_ptr_.size(); } - vertex_t const *get_level_ptr_nocheck(size_t level) const - { - return level_ptr_[level]->data(); - } + vertex_t const *get_level_ptr_nocheck(size_t level) const { return level_ptr_[level]->data(); } vertex_t *get_level_ptr_nocheck(size_t level) { return level_ptr_[level]->data(); } From 9e753ca3f93ee5ace1cfbdfde26e9bc687c8ce2e Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 17:26:00 -0400 Subject: [PATCH 14/18] add cuco/static_map guards for a few primitives that didn't have them --- cpp/include/dendrogram.hpp | 1 - cpp/{src => include}/experimental/include_cuco_static_map.cuh | 0 .../patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh | 2 +- .../patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh | 2 -- cpp/include/utilities/collect_comm.cuh | 2 +- 5 files changed, 2 insertions(+), 5 deletions(-) rename cpp/{src => include}/experimental/include_cuco_static_map.cuh (100%) diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp index 6316bdb25fc..bb9ba470a52 100644 --- a/cpp/include/dendrogram.hpp +++ b/cpp/include/dendrogram.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include diff --git a/cpp/src/experimental/include_cuco_static_map.cuh b/cpp/include/experimental/include_cuco_static_map.cuh similarity index 100% rename from cpp/src/experimental/include_cuco_static_map.cuh rename to cpp/include/experimental/include_cuco_static_map.cuh diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 11cf2cb1137..19a5f67c9de 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -27,7 +27,7 @@ #include -#include +#include #include diff --git a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 0b3588bc8c5..e621ed91ddb 100644 --- a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -25,8 +25,6 @@ #include -#include - #include namespace cugraph { diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh index 5ca58ebeb17..8d2227c0f60 100644 --- a/cpp/include/utilities/collect_comm.cuh +++ b/cpp/include/utilities/collect_comm.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include From d2d1dfe541e26dee9243484828ba276fa7b17fad Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 17:58:46 -0400 Subject: [PATCH 15/18] add guard for cuco static 
map usage in collect_values_for_keys --- cpp/src/experimental/louvain.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 5ce43dbf3df..3136515faa6 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -362,6 +362,7 @@ class Louvain { rmm::device_uvector &next_cluster_v, bool up_down) { +#ifdef CUCO_STATIC_MAP_DEFINED rmm::device_uvector old_cluster_sum_v( current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); rmm::device_uvector cluster_subtract_v( @@ -500,6 +501,7 @@ class Louvain { d_src_cluster_cache_, [] __device__(auto src, auto dst, auto wt, auto x, auto y) { return wt; }, weight_t{0}); +#endif } void shrink_graph() From 4a266868c88a29c5d6f43045d0a5ad195e4d48b1 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 18:48:28 -0400 Subject: [PATCH 16/18] update louvain parameters and documentation to reference a graph_view --- cpp/include/algorithms.hpp | 39 ++++++++++++++++++------------------ cpp/src/community/louvain.cu | 36 ++++++++++++++++----------------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp index 3c89c6a2dd3..b8706d81e21 100644 --- a/cpp/include/algorithms.hpp +++ b/cpp/include/algorithms.hpp @@ -616,7 +616,7 @@ weight_t hungarian(raft::handle_t const &handle, * * @throws cugraph::logic_error when an error occurs. * - * @tparam graph_t Type of graph + * @tparam graph_view_t Type of graph * * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, * @param[in] graph input graph object (CSR) @@ -633,13 +633,13 @@ weight_t hungarian(raft::handle_t const &handle, * 2) modularity of the returned clustering * */ -template -std::pair louvain( +template +std::pair louvain( raft::handle_t const &handle, - graph_t const &graph, - typename graph_t::vertex_type *clustering, - size_t max_level = 100, - typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); + graph_view_t const &graph_view, + typename graph_view_t::vertex_type *clustering, + size_t max_level = 100, + typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1}); /** * @brief Louvain implementation, returning dendrogram @@ -654,10 +654,10 @@ std::pair louvain( * * @throws cugraph::logic_error when an error occurs. * - * @tparam graph_t Type of graph + * @tparam graph_view_t Type of graph * * @param[in] handle Library handle (RAFT) - * @param[in] graph Input graph object (CSR) + * @param[in] graph_view Input graph view object (CSR) * @param[in] max_level (optional) maximum number of levels to run (default 100) * @param[in] resolution (optional) The value of the resolution parameter to use. * Called gamma in the modularity formula, this changes the size @@ -670,12 +670,13 @@ std::pair louvain( * 2) modularity of the returned clustering * */ -template -std::pair>, typename graph_t::weight_type> +template +std::pair>, + typename graph_view_t::weight_type> louvain(raft::handle_t const &handle, - graph_t const &graph, - size_t max_level = 100, - typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); + graph_view_t const &graph_view, + size_t max_level = 100, + typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1}); /** * @brief Flatten a Dendrogram at a particular level @@ -687,7 +688,7 @@ louvain(raft::handle_t const &handle, * * @throws cugraph::logic_error when an error occurs. 
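 *
 * As a usage sketch only (handle/graph_view construction and the vertex_t/weight_t
 * aliases are assumed to exist in the caller; they are not part of this declaration),
 * the dendrogram-returning louvain overload above is combined with this flatten step
 * roughly the way the clustering-pointer overload does it internally:
 *
 * @code
 *   std::unique_ptr<Dendrogram<vertex_t>> dendrogram{};
 *   weight_t modularity{};
 *   std::tie(dendrogram, modularity) =
 *     cugraph::louvain(handle, graph_view, size_t{100}, weight_t{1});
 *
 *   rmm::device_uvector<vertex_t> clustering(graph_view.get_number_of_local_vertices(),
 *                                            handle.get_stream());
 *   cugraph::flatten_dendrogram(handle, graph_view, *dendrogram, clustering.data());
 * @endcode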
* - * @tparam graph_t Type of graph + * @tparam graph_view_t Type of graph * * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, * @param[in] graph input graph object @@ -695,11 +696,11 @@ louvain(raft::handle_t const &handle, * @param[out] clustering Pointer to device array where the clustering should be stored * */ -template +template void flatten_dendrogram(raft::handle_t const &handle, - graph_t const &graph_view, - Dendrogram const &dendrogram, - typename graph_t::vertex_type *clustering); + graph_view_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_view_t::vertex_type *clustering); /** * @brief Leiden implementation diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index a3a2ffa39eb..25b11e9028a 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -107,43 +107,43 @@ void flatten_dendrogram( } // namespace detail -template -std::pair>, typename graph_t::weight_type> +template +std::pair>, typename graph_view_t::weight_type> louvain(raft::handle_t const &handle, - graph_t const &graph, + graph_view_t const &graph_view, size_t max_level, - typename graph_t::weight_type resolution) + typename graph_view_t::weight_type resolution) { - return detail::louvain(handle, graph, max_level, resolution); + return detail::louvain(handle, graph_view, max_level, resolution); } -template +template void flatten_dendrogram(raft::handle_t const &handle, - graph_t const &graph_view, - Dendrogram const &dendrogram, - typename graph_t::vertex_type *clustering) + graph_view_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_view_t::vertex_type *clustering) { detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering); } -template -std::pair louvain(raft::handle_t const &handle, - graph_t const &graph, - typename graph_t::vertex_type *clustering, +template +std::pair louvain(raft::handle_t const &handle, + graph_view_t const &graph_view, + typename graph_view_t::vertex_type *clustering, size_t max_level, - typename graph_t::weight_type resolution) + typename graph_view_t::weight_type resolution) { - using vertex_t = typename graph_t::vertex_type; - using weight_t = typename graph_t::weight_type; + using vertex_t = typename graph_view_t::vertex_type; + using weight_t = typename graph_view_t::weight_type; CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); std::unique_ptr> dendrogram; weight_t modularity; - std::tie(dendrogram, modularity) = louvain(handle, graph, max_level, resolution); + std::tie(dendrogram, modularity) = louvain(handle, graph_view, max_level, resolution); - flatten_dendrogram(handle, graph, *dendrogram, clustering); + flatten_dendrogram(handle, graph_view, *dendrogram, clustering); return std::make_pair(dendrogram->num_levels(), modularity); } From 38dc0e5a73ee8aab8c204b0eb7a6c26ba40e24bc Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 29 Mar 2021 09:40:21 -0400 Subject: [PATCH 17/18] fix clang formatting issue --- cpp/src/community/louvain.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index 25b11e9028a..2affcf29805 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -108,7 +108,8 @@ void flatten_dendrogram( } // namespace detail template -std::pair>, typename graph_view_t::weight_type> +std::pair>, + typename graph_view_t::weight_type> louvain(raft::handle_t const &handle, graph_view_t const 
&graph_view, size_t max_level, @@ -127,11 +128,12 @@ void flatten_dendrogram(raft::handle_t const &handle, } template -std::pair louvain(raft::handle_t const &handle, - graph_view_t const &graph_view, - typename graph_view_t::vertex_type *clustering, - size_t max_level, - typename graph_view_t::weight_type resolution) +std::pair louvain( + raft::handle_t const &handle, + graph_view_t const &graph_view, + typename graph_view_t::vertex_type *clustering, + size_t max_level, + typename graph_view_t::weight_type resolution) { using vertex_t = typename graph_view_t::vertex_type; using weight_t = typename graph_view_t::weight_type; From a8c91c20b10742cdba922ea9a410f9e87cf81222 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 29 Mar 2021 13:09:34 -0400 Subject: [PATCH 18/18] legacy louvain path now uses cuco also --- cpp/tests/community/louvain_test.cpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index ab92ce411b9..2ebf9a85902 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -74,10 +74,26 @@ class Tests_Louvain : public ::testing::TestWithParam { directed, configuration.graph_file_full_path_); auto graph_view = graph->view(); - louvain(graph_view, - graph_view.get_number_of_vertices(), - configuration.expected_level_, - configuration.expected_modularity_); + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + cudaDeviceProp device_prop; + CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0)); + + if (device_prop.major < 7) { + EXPECT_THROW(louvain(graph_view, + graph_view.get_number_of_vertices(), + configuration.expected_level_, + configuration.expected_modularity_), + cugraph::logic_error); + } else { + louvain(graph_view, + graph_view.get_number_of_vertices(), + configuration.expected_level_, + configuration.expected_modularity_); + } } template