From c4e8dc041b176e0b60d14677486c8260478f4d90 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 4 Nov 2021 19:45:34 -0400 Subject: [PATCH 01/17] Updates: bitmask_and returns output mask and count of valid bits --- cpp/include/cudf/detail/null_mask.cuh | 84 +++++++++++++++---------- cpp/include/cudf/detail/null_mask.hpp | 4 +- cpp/include/cudf/null_mask.hpp | 7 ++- cpp/src/binaryop/binaryop.cpp | 6 +- cpp/src/bitmask/null_mask.cu | 90 ++++++++++++++------------- cpp/src/datetime/datetime_ops.cu | 2 +- cpp/src/groupby/hash/groupby.cu | 4 +- cpp/src/groupby/sort/aggregate.cpp | 4 +- cpp/src/groupby/sort/sort_helper.cu | 2 +- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/semi_join.cu | 2 +- cpp/src/strings/repeat_strings.cu | 2 +- cpp/src/structs/utilities.cpp | 3 +- cpp/tests/bitmask/bitmask_tests.cpp | 6 +- 14 files changed, 120 insertions(+), 98 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index b6d6a2529ed..1f05e215161 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -23,26 +23,34 @@ #include #include +#include namespace cudf { namespace detail { /** * @brief Computes the merger of an array of bitmasks using a binary operator * + * @tparam block_size Number of threads in each thread block + * @tparam Binop Type of binary operator + * * @param op The binary operator used to combine the bitmasks * @param destination The bitmask to write result into * @param source Array of source mask pointers. All masks must be of same size * @param source_begin_bits Array of offsets into corresponding @p source masks. * Must be same size as source array * @param source_size_bits Number of bits in each mask in @p source + * @param count Pointer to valid-bit counter */ -template +template __global__ void offset_bitmask_binop(Binop op, device_span destination, device_span source, device_span source_begin_bits, - size_type source_size_bits) + size_type source_size_bits, + size_type* valid_count_ptr) { + size_type thread_valid_count = 0; + for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x; destination_word_index < destination.size(); destination_word_index += blockDim.x * gridDim.x) { @@ -52,17 +60,21 @@ __global__ void offset_bitmask_binop(Binop op, source_begin_bits[0], source_begin_bits[0] + source_size_bits); for (size_type i = 1; i < source.size(); i++) { - destination_word = - - op(destination_word, - detail::get_mask_offset_word(source[i], - destination_word_index, - source_begin_bits[i], - source_begin_bits[i] + source_size_bits)); + destination_word = op(destination_word, + detail::get_mask_offset_word(source[i], + destination_word_index, + source_begin_bits[i], + source_begin_bits[i] + source_size_bits)); } destination[destination_word_index] = destination_word; + thread_valid_count += __popc(destination_word); } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + size_type block_valid_count = BlockReduce(temp_storage).Sum(thread_valid_count); + if (threadIdx.x == 0) { atomicAdd(valid_count_ptr, block_valid_count); } } /** @@ -72,7 +84,7 @@ __global__ void offset_bitmask_binop(Binop op, * @param stream CUDA stream used for device memory operations and kernel launches */ template -rmm::device_buffer bitmask_binop( +std::pair bitmask_binop( Binop op, host_span masks, host_span masks_begin_bits, @@ -82,33 +94,34 @@ rmm::device_buffer bitmask_binop( { auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr}; - inplace_bitmask_binop(op, - device_span(static_cast(dest_mask.data()), - num_bitmask_words(mask_size_bits)), - masks, - masks_begin_bits, - mask_size_bits, - stream, - mr); - - return dest_mask; + auto valid_count = + inplace_bitmask_binop(op, + device_span(static_cast(dest_mask.data()), + num_bitmask_words(mask_size_bits)), + masks, + masks_begin_bits, + mask_size_bits, + stream, + mr); + + return std::make_pair(std::move(dest_mask), valid_count); } /** * @brief Performs a merge of the specified bitmasks using the binary operator - * provided, and writes in place to destination + * provided, writes in place to destination and returns count of valid bits * - * @param op The binary operator used to combine the bitmasks - * @param dest_mask Destination to which the merged result is written - * @param masks The list of data pointers of the bitmasks to be merged - * @param masks_begin_bits The bit offsets from which each mask is to be merged - * @param mask_size_bits The number of bits to be ANDed in each mask - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer Output bitmask + * @param[in] op The binary operator used to combine the bitmasks + * @param[out] dest_mask Destination to which the merged result is written + * @param[in] masks The list of data pointers of the bitmasks to be merged + * @param[in] masks_begin_bits The bit offsets from which each mask is to be merged + * @param[in] mask_size_bits The number of bits to be ANDed in each mask + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param[in] mr Device memory resource used to allocate the returned device_buffer + * @return size_type Count of valid bits */ template -void inplace_bitmask_binop( +size_type inplace_bitmask_binop( Binop op, device_span dest_mask, host_span masks, @@ -124,6 +137,7 @@ void inplace_bitmask_binop( CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }), "Mask pointer cannot be null"); + rmm::device_scalar d_counter{0, stream, mr}; rmm::device_uvector d_masks(masks.size(), stream, mr); rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); @@ -135,11 +149,13 @@ void inplace_bitmask_binop( cudaMemcpyHostToDevice, stream.value())); - cudf::detail::grid_1d config(dest_mask.size(), 256); - offset_bitmask_binop<<>>( - op, dest_mask, d_masks, d_begin_bits, mask_size_bits); + auto constexpr block_size = 256; + cudf::detail::grid_1d config(dest_mask.size(), block_size); + offset_bitmask_binop + <<>>( + op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data()); CHECK_CUDA(stream.value()); - stream.synchronize(); + return d_counter.value(stream); } /** diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index f757929d839..d682021177b 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -114,7 +114,7 @@ rmm::device_buffer copy_bitmask( * * @param stream CUDA stream used for device memory operations and kernel launches */ -rmm::device_buffer bitmask_and( +std::pair bitmask_and( host_span masks, host_span masks_begin_bits, size_type mask_size_bits, @@ -126,7 +126,7 @@ rmm::device_buffer bitmask_and( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -rmm::device_buffer bitmask_and( +std::pair bitmask_and( table_view const& view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 7146360fd6f..af01059b9d1 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -202,16 +202,17 @@ rmm::device_buffer copy_bitmask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a bitwise AND of the bitmasks of columns of a table + * @brief Returns a bitwise AND of the bitmasks of columns of a table and count + * of valid bits * * If any of the columns isn't nullable, it is considered all valid. * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer Output bitmask + * @return Output bitmask and count of valid bits */ -rmm::device_buffer bitmask_and( +std::pair bitmask_and( table_view const& view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 73a3f55163d..6c39807291d 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -392,7 +392,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); } else { - auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); + auto [new_mask, _] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); return make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); } @@ -799,8 +799,8 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto new_mask = bitmask_and(table_view({lhs, rhs}), stream, mr); - auto out = make_fixed_width_column( + auto [new_mask, _] = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto out = make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); // Check for 0 sized data diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index fe13277ac8e..e89b9b261a6 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -318,11 +318,11 @@ void inplace_bitmask_and(device_span dest_mask, } // Bitwise AND of the masks -rmm::device_buffer bitmask_and(host_span masks, - host_span begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair bitmask_and(host_span masks, + host_span begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, @@ -333,6 +333,39 @@ rmm::device_buffer bitmask_and(host_span masks, mr); } +// Returns the bitwise AND of the null masks of all columns in the table view +std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + rmm::device_buffer null_mask{0, stream, mr}; + if (view.num_rows() == 0 or view.num_columns() == 0) { + return std::make_pair(std::move(null_mask), 0); + } + + std::vector masks; + std::vector offsets; + for (auto&& col : view) { + if (col.nullable()) { + masks.push_back(col.null_mask()); + offsets.push_back(col.offset()); + } + } + + if (masks.size() > 0) { + return cudf::detail::bitmask_binop( + [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, + masks, + offsets, + view.num_rows(), + stream, + mr); + } + + return std::make_pair(std::move(null_mask), 0); +} + cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop, @@ -371,37 +404,6 @@ cudf::size_type count_unset_bits(bitmask_type const* bitmask, return (num_bits - detail::count_set_bits(bitmask, start, stop, stream)); } -// Returns the bitwise AND of the null masks of all columns in the table view -rmm::device_buffer bitmask_and(table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - rmm::device_buffer null_mask{0, stream, mr}; - if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; } - - std::vector masks; - std::vector offsets; - for (auto&& col : view) { - if (col.nullable()) { - masks.push_back(col.null_mask()); - offsets.push_back(col.offset()); - } - } - - if (masks.size() > 0) { - return cudf::detail::bitmask_binop( - [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, - masks, - offsets, - view.num_rows(), - stream, - mr); - } - - return null_mask; -} - // Returns the bitwise OR of the null masks of all columns in the table view rmm::device_buffer bitmask_or(table_view const& view, rmm::cuda_stream_view stream, @@ -422,12 +424,13 @@ rmm::device_buffer bitmask_or(table_view const& view, if (static_cast(masks.size()) == view.num_columns()) { return cudf::detail::bitmask_binop( - [] __device__(bitmask_type left, bitmask_type right) { return left | right; }, - masks, - offsets, - view.num_rows(), - stream, - mr); + [] __device__(bitmask_type left, bitmask_type right) { return left | right; }, + masks, + offsets, + view.num_rows(), + stream, + mr) + .first; } return null_mask; @@ -502,7 +505,8 @@ rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_ return detail::copy_bitmask(view, rmm::cuda_stream_default, mr); } -rmm::device_buffer bitmask_and(table_view const& view, rmm::mr::device_memory_resource* mr) +std::pair bitmask_and(table_view const& view, + rmm::mr::device_memory_resource* mr) { return detail::bitmask_and(view, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 6e892b3e461..87adcd37e3a 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -364,7 +364,7 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu stream, mr); - auto output_null_mask = + auto [output_null_mask, _] = cudf::detail::bitmask_and(table_view{{timestamp_column, months_column}}, stream, mr); output->set_null_mask(std::move(output_null_mask)); return output; diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index e7024c80a68..a6c0cd71c38 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -390,7 +390,7 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask{bitmask_and(keys, stream, rmm::mr::get_current_device_resource())}; + auto [row_bitmask, _] = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()); bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; @@ -502,7 +502,7 @@ void compute_single_pass_aggs(table_view const& keys, bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto row_bitmask = - skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream) : rmm::device_buffer{}; + skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).first : rmm::device_buffer{}; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 83c6c1bca57..fab6a4b5f3b 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -538,8 +538,8 @@ void aggregate_result_functor::operator()(aggregation con */ auto column_view_with_common_nulls(column_view const& column_0, column_view const& column_1) { - rmm::device_buffer new_nullmask = cudf::bitmask_and(table_view{{column_0, column_1}}); - auto null_count = cudf::count_unset_bits( + auto [new_nullmask, _] = cudf::bitmask_and(table_view{{column_0, column_1}}); + auto null_count = cudf::count_unset_bits( static_cast(new_nullmask.data()), 0, column_0.size()); if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); } auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 1caf2ff0371..fa1bbd59801 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -276,7 +276,7 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto row_bitmask = cudf::detail::bitmask_and(_keys, stream); + auto [row_bitmask, _] = cudf::detail::bitmask_and(_keys, stream); _keys_bitmask_column = make_numeric_column(data_type(type_id::INT8), _keys.num_rows(), diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 99a94c45510..380c05a8b18 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -92,7 +92,7 @@ void build_join_hash_table(cudf::table_view const& build, hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); - auto const row_bitmask = cudf::detail::bitmask_and(build, stream); + auto const [row_bitmask, _] = cudf::detail::bitmask_and(build, stream); row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 4a2f46d6f43..5b5dd418a97 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -97,7 +97,7 @@ std::unique_ptr> left_semi_anti_join( // contain a NULL in any column as they will never compare to equal. auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{} - : cudf::detail::bitmask_and(right_flattened_keys, stream); + : cudf::detail::bitmask_and(right_flattened_keys, stream).first; // skip rows that are null here. thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 2e5be9e55f6..6e6708ae929 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -319,7 +319,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, // We generate new bitmask by AND of the input columns' bitmasks. // Note that if the input columns are nullable, the output column will also be nullable (which may // not have nulls). - auto null_mask = + auto [null_mask, _] = cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); return make_strings_column(strings_count, diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 47f8f29385c..b5f897ccccf 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -383,7 +383,8 @@ std::tuple> superimpose_paren std::vector{0, 0}, child.offset() + child.size(), stream, - mr)); + mr) + .first); return reinterpret_cast(ret_validity_buffers.back().data()); }(); diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index d82ff7f2ac4..dc11ef6ac5d 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -545,9 +545,9 @@ TEST_F(MergeBitmaskTest, TestBitmaskAnd) auto const input2 = cudf::table_view({bools_col1, bools_col2}); auto const input3 = cudf::table_view({bools_col1, bools_col2, bools_col3}); - rmm::device_buffer result1 = cudf::bitmask_and(input1); - rmm::device_buffer result2 = cudf::bitmask_and(input2); - rmm::device_buffer result3 = cudf::bitmask_and(input3); + auto [result1, count1] = cudf::bitmask_and(input1); + auto [result2, count2] = cudf::bitmask_and(input2); + auto [result3, count3] = cudf::bitmask_and(input3); auto odd_indices = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); From 005183194b58f102d2b308bfb757efe1c3e99fd0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 4 Nov 2021 20:27:42 -0400 Subject: [PATCH 02/17] Update group sort to use new bitmask_and --- cpp/include/cudf/detail/null_mask.cuh | 1 + cpp/src/groupby/sort/aggregate.cpp | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 1f05e215161..1fce4ba46fe 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -74,6 +74,7 @@ __global__ void offset_bitmask_binop(Binop op, using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_valid_count = BlockReduce(temp_storage).Sum(thread_valid_count); + if (threadIdx.x == 0) { atomicAdd(valid_count_ptr, block_valid_count); } } diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index fab6a4b5f3b..be17fafbfc4 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -538,9 +538,9 @@ void aggregate_result_functor::operator()(aggregation con */ auto column_view_with_common_nulls(column_view const& column_0, column_view const& column_1) { - auto [new_nullmask, _] = cudf::bitmask_and(table_view{{column_0, column_1}}); - auto null_count = cudf::count_unset_bits( - static_cast(new_nullmask.data()), 0, column_0.size()); + auto table = table_view{{column_0, column_1}}; + auto [new_nullmask, null_count] = cudf::bitmask_and(table); + if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); } auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { return column_view(col.type(), From 9442b0dab03ddb267f8d0995d4cb2a9229f4d071 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 5 Nov 2021 13:43:42 -0400 Subject: [PATCH 03/17] Updates: bitmask_and return a struct of mask, valid count and null count --- cpp/include/cudf/detail/null_mask.cuh | 16 ++++++++-------- cpp/include/cudf/detail/null_mask.hpp | 21 +++++++++++---------- cpp/include/cudf/null_mask.hpp | 20 ++++++++++++++------ cpp/src/binaryop/binaryop.cpp | 6 +++--- cpp/src/bitmask/null_mask.cu | 25 ++++++++++++------------- cpp/src/datetime/datetime_ops.cu | 4 ++-- cpp/src/groupby/hash/groupby.cu | 8 +++++--- cpp/src/groupby/sort/aggregate.cpp | 5 +++-- cpp/src/groupby/sort/sort_helper.cu | 2 +- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/semi_join.cu | 7 ++++--- cpp/src/strings/repeat_strings.cu | 4 ++-- cpp/src/structs/utilities.cpp | 13 +++++++------ cpp/tests/bitmask/bitmask_tests.cpp | 12 ++++++------ 14 files changed, 79 insertions(+), 66 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 1fce4ba46fe..d11edc35f92 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -85,13 +85,12 @@ __global__ void offset_bitmask_binop(Binop op, * @param stream CUDA stream used for device memory operations and kernel launches */ template -std::pair bitmask_binop( - Binop op, - host_span masks, - host_span masks_begin_bits, - size_type mask_size_bits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +bitmask bitmask_binop(Binop op, + host_span masks, + host_span masks_begin_bits, + size_type mask_size_bits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr}; @@ -104,8 +103,9 @@ std::pair bitmask_binop( mask_size_bits, stream, mr); + auto null_count = mask_size_bits - valid_count; - return std::make_pair(std::move(dest_mask), valid_count); + return bitmask{std::move(dest_mask), valid_count, null_count}; } /** diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index d682021177b..5a488affcb2 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -23,6 +23,9 @@ #include namespace cudf { + +struct bitmask; + namespace detail { /** @@ -114,22 +117,20 @@ rmm::device_buffer copy_bitmask( * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::pair bitmask_and( - host_span masks, - host_span masks_begin_bits, - size_type mask_size_bits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +bitmask bitmask_and(host_span masks, + host_span masks_begin_bits, + size_type mask_size_bits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_and * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::pair bitmask_and( - table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +bitmask bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_or diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index af01059b9d1..74a48b0d101 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -24,6 +24,15 @@ namespace cudf { +/** + * @brief Bitmask output type. + */ +struct bitmask { + rmm::device_buffer mask; ///< Resulting bitmask + size_type num_set_bits; ///< Number of set bits + size_type num_unset_bits; ///< Number of unset bits +}; + /** * @addtogroup column_nullmask * @{ @@ -202,19 +211,18 @@ rmm::device_buffer copy_bitmask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a bitwise AND of the bitmasks of columns of a table and count - * of valid bits + * @brief Returns a struct of bitwise AND of the bitmasks of columns of a table, + * count of valid bits and count of null bits * * If any of the columns isn't nullable, it is considered all valid. * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns * @param mr Device memory resource used to allocate the returned device_buffer - * @return Output bitmask and count of valid bits + * @return A struct of resulting bitmask, count of valid bits and count of null bits */ -std::pair bitmask_and( - table_view const& view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +bitmask bitmask_and(table_view const& view, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a bitwise OR of the bitmasks of columns of a table diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 6c39807291d..ee1f9d4dd4b 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -392,7 +392,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); } else { - auto [new_mask, _] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); + auto new_mask = std::move(cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr).mask); return make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); } @@ -799,8 +799,8 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto [new_mask, _] = bitmask_and(table_view({lhs, rhs}), stream, mr); - auto out = make_fixed_width_column( + auto new_mask = std::move(bitmask_and(table_view({lhs, rhs}), stream, mr).mask); + auto out = make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); // Check for 0 sized data diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index e89b9b261a6..e5b881ea337 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -318,11 +318,11 @@ void inplace_bitmask_and(device_span dest_mask, } // Bitwise AND of the masks -std::pair bitmask_and(host_span masks, - host_span begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +bitmask bitmask_and(host_span masks, + host_span begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, @@ -334,14 +334,14 @@ std::pair bitmask_and(host_span bitmask_and(table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +bitmask bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { - return std::make_pair(std::move(null_mask), 0); + return bitmask{std::move(null_mask), 0, 0}; } std::vector masks; @@ -363,7 +363,7 @@ std::pair bitmask_and(table_view const& view, mr); } - return std::make_pair(std::move(null_mask), 0); + return bitmask{std::move(null_mask), 0, 0}; } cudf::size_type count_set_bits(bitmask_type const* bitmask, @@ -430,7 +430,7 @@ rmm::device_buffer bitmask_or(table_view const& view, view.num_rows(), stream, mr) - .first; + .mask; } return null_mask; @@ -505,8 +505,7 @@ rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_ return detail::copy_bitmask(view, rmm::cuda_stream_default, mr); } -std::pair bitmask_and(table_view const& view, - rmm::mr::device_memory_resource* mr) +bitmask bitmask_and(table_view const& view, rmm::mr::device_memory_resource* mr) { return detail::bitmask_and(view, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 87adcd37e3a..9bd4bb20291 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -364,8 +364,8 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu stream, mr); - auto [output_null_mask, _] = - cudf::detail::bitmask_and(table_view{{timestamp_column, months_column}}, stream, mr); + auto output_null_mask = std::move( + cudf::detail::bitmask_and(table_view{{timestamp_column, months_column}}, stream, mr).mask); output->set_null_mask(std::move(output_null_mask)); return output; } diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index a6c0cd71c38..6d0d1c0f644 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -390,7 +390,8 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto [row_bitmask, _] = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()); + auto row_bitmask = + std::move(bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).mask); bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; @@ -501,8 +502,9 @@ void compute_single_pass_aggs(table_view const& keys, bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - auto row_bitmask = - skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).first : rmm::device_buffer{}; + auto row_bitmask = skip_key_rows_with_nulls + ? std::move(cudf::detail::bitmask_and(keys, stream).mask) + : rmm::device_buffer{}; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index be17fafbfc4..b263cbb9732 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -538,8 +538,9 @@ void aggregate_result_functor::operator()(aggregation con */ auto column_view_with_common_nulls(column_view const& column_0, column_view const& column_1) { - auto table = table_view{{column_0, column_1}}; - auto [new_nullmask, null_count] = cudf::bitmask_and(table); + auto bitmask_output = cudf::bitmask_and(table_view{{column_0, column_1}}); + auto new_nullmask = std::move(bitmask_output.mask); + auto null_count = bitmask_output.num_unset_bits; if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); } auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index fa1bbd59801..615180ecb87 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -276,7 +276,7 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto [row_bitmask, _] = cudf::detail::bitmask_and(_keys, stream); + auto row_bitmask = std::move(cudf::detail::bitmask_and(_keys, stream).mask); _keys_bitmask_column = make_numeric_column(data_type(type_id::INT8), _keys.num_rows(), diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 380c05a8b18..52ac3f8a646 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -92,7 +92,7 @@ void build_join_hash_table(cudf::table_view const& build, hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); - auto const [row_bitmask, _] = cudf::detail::bitmask_and(build, stream); + auto const row_bitmask = std::move(cudf::detail::bitmask_and(build, stream).mask); row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 5b5dd418a97..acfce688fc4 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -95,9 +95,10 @@ std::unique_ptr> left_semi_anti_join( // if compare_nulls == UNEQUAL, we can simply ignore any rows that // contain a NULL in any column as they will never compare to equal. - auto const row_bitmask = (compare_nulls == null_equality::EQUAL) - ? rmm::device_buffer{} - : cudf::detail::bitmask_and(right_flattened_keys, stream).first; + auto const row_bitmask = + (compare_nulls == null_equality::EQUAL) + ? rmm::device_buffer{} + : std::move(cudf::detail::bitmask_and(right_flattened_keys, stream).mask); // skip rows that are null here. thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 6e6708ae929..7994a4bd892 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -319,8 +319,8 @@ std::unique_ptr repeat_strings(strings_column_view const& input, // We generate new bitmask by AND of the input columns' bitmasks. // Note that if the input columns are nullable, the output column will also be nullable (which may // not have nulls). - auto [null_mask, _] = - cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); + auto null_mask = std::move( + cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr).mask); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index b5f897ccccf..a740950f0f6 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -379,12 +379,13 @@ std::tuple> superimpose_paren // and the _null_mask(). It would be better to AND the bits from the beginning, and apply // offset() uniformly. // Alternatively, one could construct a big enough buffer, and use inplace_bitwise_and. - ret_validity_buffers.push_back(cudf::detail::bitmask_and(parent_child_null_masks, - std::vector{0, 0}, - child.offset() + child.size(), - stream, - mr) - .first); + ret_validity_buffers.push_back( + std::move(cudf::detail::bitmask_and(parent_child_null_masks, + std::vector{0, 0}, + child.offset() + child.size(), + stream, + mr) + .mask)); return reinterpret_cast(ret_validity_buffers.back().data()); }(); diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index dc11ef6ac5d..7885ccefe4b 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -545,19 +545,19 @@ TEST_F(MergeBitmaskTest, TestBitmaskAnd) auto const input2 = cudf::table_view({bools_col1, bools_col2}); auto const input3 = cudf::table_view({bools_col1, bools_col2, bools_col3}); - auto [result1, count1] = cudf::bitmask_and(input1); - auto [result2, count2] = cudf::bitmask_and(input2); - auto [result3, count3] = cudf::bitmask_and(input3); + auto result1 = cudf::bitmask_and(input1); + auto result2 = cudf::bitmask_and(input2); + auto result3 = cudf::bitmask_and(input3); auto odd_indices = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); auto odd = cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows()); - EXPECT_EQ(nullptr, result1.data()); + EXPECT_EQ(nullptr, result1.mask.data()); CUDF_TEST_EXPECT_EQUAL_BUFFERS( - result2.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows())); + result2.mask.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows())); CUDF_TEST_EXPECT_EQUAL_BUFFERS( - result3.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows())); + result3.mask.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows())); } TEST_F(MergeBitmaskTest, TestBitmaskOr) From 7d23f99563a052054d7e85b5cb8598af9bc73bdf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 5 Nov 2021 15:06:36 -0400 Subject: [PATCH 04/17] Refactor related functions with new bitmask_and --- cpp/src/binaryop/binaryop.cpp | 20 ++++++++++++++------ cpp/src/datetime/datetime_ops.cu | 6 +++--- cpp/src/groupby/sort/sort_helper.cu | 6 +++--- cpp/src/strings/repeat_strings.cu | 8 ++++---- cpp/src/structs/utilities.cpp | 23 ++++++++++++----------- 5 files changed, 36 insertions(+), 27 deletions(-) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index ee1f9d4dd4b..794d0676f8a 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -392,9 +392,13 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); } else { - auto new_mask = std::move(cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr).mask); - return make_fixed_width_column( - output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); + auto bitmask_output = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); + return make_fixed_width_column(output_type, + lhs.size(), + std::move(bitmask_output.mask), + bitmask_output.num_unset_bits, + stream, + mr); } }; @@ -799,9 +803,13 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto new_mask = std::move(bitmask_and(table_view({lhs, rhs}), stream, mr).mask); - auto out = make_fixed_width_column( - output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); + auto bitmask_output = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto out = make_fixed_width_column(output_type, + lhs.size(), + std::move(bitmask_output.mask), + bitmask_output.num_unset_bits, + stream, + mr); // Check for 0 sized data if (lhs.is_empty() or rhs.is_empty()) return out; diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 9bd4bb20291..fef720da174 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -364,9 +364,9 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu stream, mr); - auto output_null_mask = std::move( - cudf::detail::bitmask_and(table_view{{timestamp_column, months_column}}, stream, mr).mask); - output->set_null_mask(std::move(output_null_mask)); + auto bitmask_output = + cudf::detail::bitmask_and(table_view{{timestamp_column, months_column}}, stream, mr); + output->set_null_mask(std::move(bitmask_output.mask), bitmask_output.num_unset_bits); return output; } diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 615180ecb87..be26d9e52af 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -276,12 +276,12 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto row_bitmask = std::move(cudf::detail::bitmask_and(_keys, stream).mask); + auto bitmask_output = cudf::detail::bitmask_and(_keys, stream); _keys_bitmask_column = make_numeric_column(data_type(type_id::INT8), _keys.num_rows(), - std::move(row_bitmask), - cudf::UNKNOWN_NULL_COUNT, + std::move(bitmask_output.mask), + bitmask_output.num_unset_bits, stream); auto keys_bitmask_view = _keys_bitmask_column->mutable_view(); diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 7994a4bd892..ba218883f8a 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -319,14 +319,14 @@ std::unique_ptr repeat_strings(strings_column_view const& input, // We generate new bitmask by AND of the input columns' bitmasks. // Note that if the input columns are nullable, the output column will also be nullable (which may // not have nulls). - auto null_mask = std::move( - cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr).mask); + auto bitmask_output = + cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), - UNKNOWN_NULL_COUNT, - std::move(null_mask)); + bitmask_output.num_unset_bits, + std::move(bitmask_output.mask)); } std::pair, int64_t> repeat_strings_output_sizes( diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index a740950f0f6..7628ea51f15 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -366,10 +366,10 @@ std::tuple> superimpose_paren auto parent_child_null_masks = std::vector{structs_column.null_mask(), child.null_mask()}; - auto new_child_mask = [&] { + auto [new_child_mask, null_count] = [&] { if (not child.nullable()) { // Adopt parent STRUCT's null mask. - return structs_column.null_mask(); + return std::make_pair(structs_column.null_mask(), 0); } // Both STRUCT and child are nullable. AND() for the child's new null mask. @@ -379,14 +379,15 @@ std::tuple> superimpose_paren // and the _null_mask(). It would be better to AND the bits from the beginning, and apply // offset() uniformly. // Alternatively, one could construct a big enough buffer, and use inplace_bitwise_and. - ret_validity_buffers.push_back( - std::move(cudf::detail::bitmask_and(parent_child_null_masks, - std::vector{0, 0}, - child.offset() + child.size(), - stream, - mr) - .mask)); - return reinterpret_cast(ret_validity_buffers.back().data()); + auto bitmask_output = cudf::detail::bitmask_and(parent_child_null_masks, + std::vector{0, 0}, + child.offset() + child.size(), + stream, + mr); + ret_validity_buffers.push_back(std::move(bitmask_output.mask)); + return std::make_pair( + reinterpret_cast(ret_validity_buffers.back().data()), + bitmask_output.num_unset_bits); }(); return cudf::column_view( @@ -394,7 +395,7 @@ std::tuple> superimpose_paren child.size(), child.head(), new_child_mask, - cudf::UNKNOWN_NULL_COUNT, + null_count, child.offset(), std::vector{child.child_begin(), child.child_end()}); }; From 5abaf267fb102df06c35267a0cea1ce8f8e5a25d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 5 Nov 2021 15:17:42 -0400 Subject: [PATCH 05/17] Update unit tests --- cpp/tests/bitmask/bitmask_tests.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index 7885ccefe4b..489857325a3 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -549,6 +549,16 @@ TEST_F(MergeBitmaskTest, TestBitmaskAnd) auto result2 = cudf::bitmask_and(input2); auto result3 = cudf::bitmask_and(input3); + constexpr cudf::size_type gold_valid_count = 2; + constexpr cudf::size_type gold_null_count = 3; + + EXPECT_EQ(result1.num_set_bits, 0); + EXPECT_EQ(result1.num_unset_bits, 0); + EXPECT_EQ(result2.num_set_bits, gold_valid_count); + EXPECT_EQ(result2.num_unset_bits, gold_null_count); + EXPECT_EQ(result3.num_set_bits, gold_valid_count); + EXPECT_EQ(result3.num_unset_bits, gold_null_count); + auto odd_indices = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); auto odd = cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows()); From 4d60f87f2a81435304c3d7f895f6f6cb1cdf28ef Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 7 Nov 2021 20:09:20 -0500 Subject: [PATCH 06/17] Fix a repeated string bug --- cpp/src/strings/repeat_strings.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index ba218883f8a..7994a4bd892 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -319,14 +319,14 @@ std::unique_ptr repeat_strings(strings_column_view const& input, // We generate new bitmask by AND of the input columns' bitmasks. // Note that if the input columns are nullable, the output column will also be nullable (which may // not have nulls). - auto bitmask_output = - cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); + auto null_mask = std::move( + cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr).mask); return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), - bitmask_output.num_unset_bits, - std::move(bitmask_output.mask)); + UNKNOWN_NULL_COUNT, + std::move(null_mask)); } std::pair, int64_t> repeat_strings_output_sizes( From 8981f5d3a53438ef237951d83883bcfdac7bdabf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 8 Nov 2021 15:44:43 -0500 Subject: [PATCH 07/17] Fix a slack bit bug when counting valid bits --- cpp/include/cudf/detail/null_mask.cuh | 32 +++++++++++++++++++++++++-- cpp/src/strings/repeat_strings.cu | 8 +++---- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index d11edc35f92..941711146ef 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -49,10 +49,17 @@ __global__ void offset_bitmask_binop(Binop op, size_type source_size_bits, size_type* valid_count_ptr) { + constexpr auto const word_size{detail::size_in_bits()}; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const first_bit_index = 0; + auto const last_bit_index = source_size_bits - 1; + auto const first_word_index{word_index(first_bit_index)}; + auto const last_word_index{word_index(last_bit_index)}; + size_type thread_valid_count = 0; - for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x; - destination_word_index < destination.size(); + for (size_type destination_word_index = tid; destination_word_index < destination.size(); destination_word_index += blockDim.x * gridDim.x) { bitmask_type destination_word = detail::get_mask_offset_word(source[0], @@ -71,6 +78,27 @@ __global__ void offset_bitmask_binop(Binop op, thread_valid_count += __popc(destination_word); } + // Subtract any slack bits counted from the first and last word + // Two threads handle this -- one for first word, one for last + if (tid < 2) { + bool const first{tid == 0}; + bool const last{not first}; + + size_type bit_index = (first) ? first_bit_index : last_bit_index; + size_type word_index = (first) ? first_word_index : last_word_index; + + size_type num_slack_bits = bit_index % word_size; + if (last) { num_slack_bits = word_size - num_slack_bits - 1; } + + if (num_slack_bits > 0) { + bitmask_type word = destination[word_index]; + auto slack_mask = (first) ? set_least_significant_bits(num_slack_bits) + : set_most_significant_bits(num_slack_bits); + + thread_valid_count -= __popc(word & slack_mask); + } + } + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_valid_count = BlockReduce(temp_storage).Sum(thread_valid_count); diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 7994a4bd892..beaa68b9cc5 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -319,14 +319,14 @@ std::unique_ptr repeat_strings(strings_column_view const& input, // We generate new bitmask by AND of the input columns' bitmasks. // Note that if the input columns are nullable, the output column will also be nullable (which may // not have nulls). - auto null_mask = std::move( - cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr).mask); + auto bitmask_out = + cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), - UNKNOWN_NULL_COUNT, - std::move(null_mask)); + bitmask_out.num_unset_bits, + std::move(bitmask_out.mask)); } std::pair, int64_t> repeat_strings_output_sizes( From 56fdd444f3ac0aa02c1bf370440ef8d93650464b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 9 Nov 2021 09:36:52 -0500 Subject: [PATCH 08/17] Minor code cleanup --- cpp/include/cudf/detail/null_mask.cuh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 941711146ef..44afbfa9042 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -50,12 +50,7 @@ __global__ void offset_bitmask_binop(Binop op, size_type* valid_count_ptr) { constexpr auto const word_size{detail::size_in_bits()}; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto const first_bit_index = 0; - auto const last_bit_index = source_size_bits - 1; - auto const first_word_index{word_index(first_bit_index)}; - auto const last_word_index{word_index(last_bit_index)}; + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; size_type thread_valid_count = 0; @@ -84,8 +79,12 @@ __global__ void offset_bitmask_binop(Binop op, bool const first{tid == 0}; bool const last{not first}; - size_type bit_index = (first) ? first_bit_index : last_bit_index; - size_type word_index = (first) ? first_word_index : last_word_index; + auto const first_bit_index = 0; + auto const last_bit_index = source_size_bits - 1; + + size_type bit_index = (first) ? first_bit_index : last_bit_index; + size_type word_index = + (first) ? cudf::word_index(first_bit_index) : cudf::word_index(last_bit_index); size_type num_slack_bits = bit_index % word_size; if (last) { num_slack_bits = word_size - num_slack_bits - 1; } From 0a0153ea0575b7463610d72176cc5b6d600b7c01 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 9 Nov 2021 10:24:06 -0500 Subject: [PATCH 09/17] Get rid of unnessary std::move --- cpp/src/groupby/hash/groupby.cu | 8 +++----- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/semi_join.cu | 7 +++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 6d0d1c0f644..c29fc272078 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -390,8 +390,7 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask = - std::move(bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).mask); + auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).mask; bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; @@ -502,9 +501,8 @@ void compute_single_pass_aggs(table_view const& keys, bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - auto row_bitmask = skip_key_rows_with_nulls - ? std::move(cudf::detail::bitmask_and(keys, stream).mask) - : rmm::device_buffer{}; + auto row_bitmask = + skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).mask : rmm::device_buffer{}; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 52ac3f8a646..affde8228ce 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -92,7 +92,7 @@ void build_join_hash_table(cudf::table_view const& build, hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); - auto const row_bitmask = std::move(cudf::detail::bitmask_and(build, stream).mask); + auto const row_bitmask = cudf::detail::bitmask_and(build, stream).mask; row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index acfce688fc4..b6433578077 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -95,10 +95,9 @@ std::unique_ptr> left_semi_anti_join( // if compare_nulls == UNEQUAL, we can simply ignore any rows that // contain a NULL in any column as they will never compare to equal. - auto const row_bitmask = - (compare_nulls == null_equality::EQUAL) - ? rmm::device_buffer{} - : std::move(cudf::detail::bitmask_and(right_flattened_keys, stream).mask); + auto const row_bitmask = (compare_nulls == null_equality::EQUAL) + ? rmm::device_buffer{} + : cudf::detail::bitmask_and(right_flattened_keys, stream).mask; // skip rows that are null here. thrust::for_each_n( rmm::exec_policy(stream), From dbe9f1e1bbbae0bd90f2f6576bfd34bab2b5d525 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 9 Nov 2021 10:59:40 -0500 Subject: [PATCH 10/17] Minor cleanups --- cpp/include/cudf/detail/null_mask.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 44afbfa9042..9685cdeb3be 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -79,17 +79,17 @@ __global__ void offset_bitmask_binop(Binop op, bool const first{tid == 0}; bool const last{not first}; - auto const first_bit_index = 0; - auto const last_bit_index = source_size_bits - 1; + auto constexpr first_bit_index = 0; + auto const last_bit_index = source_size_bits - 1; size_type bit_index = (first) ? first_bit_index : last_bit_index; - size_type word_index = - (first) ? cudf::word_index(first_bit_index) : cudf::word_index(last_bit_index); size_type num_slack_bits = bit_index % word_size; if (last) { num_slack_bits = word_size - num_slack_bits - 1; } if (num_slack_bits > 0) { + size_type word_index = + (first) ? cudf::word_index(first_bit_index) : cudf::word_index(last_bit_index); bitmask_type word = destination[word_index]; auto slack_mask = (first) ? set_least_significant_bits(num_slack_bits) : set_most_significant_bits(num_slack_bits); From 827491c46c909f303688bdacf68478e7060f52df Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 9 Nov 2021 18:07:32 -0500 Subject: [PATCH 11/17] Updates: bitmask_or returns a struct of mask, valid count and null count + unit tests --- cpp/include/cudf/detail/null_mask.hpp | 7 +- cpp/include/cudf/null_mask.hpp | 8 +- cpp/src/bitmask/null_mask.cu | 103 +++++++++++++------------- cpp/tests/bitmask/bitmask_tests.cpp | 22 ++++-- 4 files changed, 75 insertions(+), 65 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 5a488affcb2..334d0339ca2 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -137,10 +137,9 @@ bitmask bitmask_and(table_view const& view, * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -rmm::device_buffer bitmask_or( - table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +bitmask bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a bitwise AND of the specified bitmasks, diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 74a48b0d101..1ea3160161a 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -225,7 +225,8 @@ bitmask bitmask_and(table_view const& view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a bitwise OR of the bitmasks of columns of a table + * @brief Returns a struct of bitwise OR of the bitmasks of columns of a table, + * count of valid bits and count of null bits * * If any of the columns isn't nullable, it is considered all valid. * If no column in the table is nullable, an empty bitmask is returned. @@ -234,9 +235,8 @@ bitmask bitmask_and(table_view const& view, * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer Output bitmask */ -rmm::device_buffer bitmask_or( - table_view const& view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +bitmask bitmask_or(table_view const& view, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index e5b881ea337..09560396c21 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -299,6 +299,44 @@ rmm::device_buffer copy_bitmask(column_view const& view, return null_mask; } +cudf::size_type count_set_bits(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) +{ + if (nullptr == bitmask) { return 0; } + + CUDF_EXPECTS(start >= 0, "Invalid range."); + CUDF_EXPECTS(start <= stop, "Invalid bit range."); + + std::size_t num_bits_to_count = stop - start; + if (num_bits_to_count == 0) { return 0; } + + auto num_words = num_bitmask_words(num_bits_to_count); + + constexpr size_type block_size{256}; + + cudf::detail::grid_1d grid(num_words, block_size); + + rmm::device_scalar non_zero_count(0, stream); + + count_set_bits_kernel + <<>>( + bitmask, start, stop - 1, non_zero_count.data()); + + return non_zero_count.value(stream); +} + +cudf::size_type count_unset_bits(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) +{ + if (nullptr == bitmask) { return 0; } + auto num_bits = (stop - start); + return (num_bits - detail::count_set_bits(bitmask, start, stop, stream)); +} + // Inplace Bitwise AND of the masks void inplace_bitmask_and(device_span dest_mask, host_span masks, @@ -366,52 +404,16 @@ bitmask bitmask_and(table_view const& view, return bitmask{std::move(null_mask), 0, 0}; } -cudf::size_type count_set_bits(bitmask_type const* bitmask, - size_type start, - size_type stop, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) -{ - if (nullptr == bitmask) { return 0; } - - CUDF_EXPECTS(start >= 0, "Invalid range."); - CUDF_EXPECTS(start <= stop, "Invalid bit range."); - - std::size_t num_bits_to_count = stop - start; - if (num_bits_to_count == 0) { return 0; } - - auto num_words = num_bitmask_words(num_bits_to_count); - - constexpr size_type block_size{256}; - - cudf::detail::grid_1d grid(num_words, block_size); - - rmm::device_scalar non_zero_count(0, stream); - - count_set_bits_kernel - <<>>( - bitmask, start, stop - 1, non_zero_count.data()); - - return non_zero_count.value(stream); -} - -cudf::size_type count_unset_bits(bitmask_type const* bitmask, - size_type start, - size_type stop, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) -{ - if (nullptr == bitmask) { return 0; } - auto num_bits = (stop - start); - return (num_bits - detail::count_set_bits(bitmask, start, stop, stream)); -} - // Returns the bitwise OR of the null masks of all columns in the table view -rmm::device_buffer bitmask_or(table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +bitmask bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; - if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; } + if (view.num_rows() == 0 or view.num_columns() == 0) { + return bitmask{std::move(null_mask), 0, 0}; + } std::vector masks; std::vector offsets; @@ -424,16 +426,15 @@ rmm::device_buffer bitmask_or(table_view const& view, if (static_cast(masks.size()) == view.num_columns()) { return cudf::detail::bitmask_binop( - [] __device__(bitmask_type left, bitmask_type right) { return left | right; }, - masks, - offsets, - view.num_rows(), - stream, - mr) - .mask; + [] __device__(bitmask_type left, bitmask_type right) { return left | right; }, + masks, + offsets, + view.num_rows(), + stream, + mr); } - return null_mask; + return bitmask{std::move(null_mask), 0, 0}; } /** @@ -510,7 +511,7 @@ bitmask bitmask_and(table_view const& view, rmm::mr::device_memory_resource* mr) return detail::bitmask_and(view, rmm::cuda_stream_default, mr); } -rmm::device_buffer bitmask_or(table_view const& view, rmm::mr::device_memory_resource* mr) +bitmask bitmask_or(table_view const& view, rmm::mr::device_memory_resource* mr) { return detail::bitmask_or(view, rmm::cuda_stream_default, mr); } diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index 489857325a3..72853f1f15a 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -580,19 +580,29 @@ TEST_F(MergeBitmaskTest, TestBitmaskOr) auto const input2 = cudf::table_view({bools_col1, bools_col2}); auto const input3 = cudf::table_view({bools_col1, bools_col2, bools_col3}); - rmm::device_buffer result1 = cudf::bitmask_or(input1); - rmm::device_buffer result2 = cudf::bitmask_or(input2); - rmm::device_buffer result3 = cudf::bitmask_or(input3); + auto result1 = cudf::bitmask_or(input1); + auto result2 = cudf::bitmask_or(input2); + auto result3 = cudf::bitmask_or(input3); + + constexpr cudf::size_type gold_valid_count = 4; + constexpr cudf::size_type gold_null_count = 1; + + EXPECT_EQ(result1.num_set_bits, 0); + EXPECT_EQ(result1.num_unset_bits, 0); + EXPECT_EQ(result2.num_set_bits, gold_valid_count); + EXPECT_EQ(result2.num_unset_bits, gold_null_count); + EXPECT_EQ(result3.num_set_bits, 0); + EXPECT_EQ(result3.num_unset_bits, 0); auto all_but_index3 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); auto null3 = cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows()); - EXPECT_EQ(nullptr, result1.data()); + EXPECT_EQ(nullptr, result1.mask.data()); CUDF_TEST_EXPECT_EQUAL_BUFFERS( - result2.data(), null3.data(), cudf::num_bitmask_words(input2.num_rows())); - EXPECT_EQ(nullptr, result3.data()); + result2.mask.data(), null3.data(), cudf::num_bitmask_words(input2.num_rows())); + EXPECT_EQ(nullptr, result3.mask.data()); } CUDF_TEST_PROGRAM_MAIN() From 67a320b61d31e3029c92894aacf5943927ab2e9c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 9 Nov 2021 18:22:59 -0500 Subject: [PATCH 12/17] Cleanups: simplify slack bit logic --- cpp/include/cudf/detail/null_mask.cuh | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 9685cdeb3be..f1d060a2140 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -73,28 +73,13 @@ __global__ void offset_bitmask_binop(Binop op, thread_valid_count += __popc(destination_word); } - // Subtract any slack bits counted from the first and last word - // Two threads handle this -- one for first word, one for last - if (tid < 2) { - bool const first{tid == 0}; - bool const last{not first}; - - auto constexpr first_bit_index = 0; - auto const last_bit_index = source_size_bits - 1; - - size_type bit_index = (first) ? first_bit_index : last_bit_index; - - size_type num_slack_bits = bit_index % word_size; - if (last) { num_slack_bits = word_size - num_slack_bits - 1; } - + // Subtract any slack bits from the last word + if (tid == 0) { + size_type const last_bit_index = source_size_bits - 1; + size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1; if (num_slack_bits > 0) { - size_type word_index = - (first) ? cudf::word_index(first_bit_index) : cudf::word_index(last_bit_index); - bitmask_type word = destination[word_index]; - auto slack_mask = (first) ? set_least_significant_bits(num_slack_bits) - : set_most_significant_bits(num_slack_bits); - - thread_valid_count -= __popc(word & slack_mask); + size_type word_index = cudf::word_index(last_bit_index); + thread_valid_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits)); } } From ca924a3c460df94aaddf0bc319c1d0b4022d69cd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 9 Nov 2021 18:23:30 -0500 Subject: [PATCH 13/17] Update java code accordingly --- java/src/main/native/src/ColumnViewJni.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index f95b05d5aeb..679f66887a7 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1492,12 +1492,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit cudf::binary_operator op = static_cast(bin_op); switch (op) { - case cudf::binary_operator::BITWISE_AND: - copy->set_null_mask(cudf::bitmask_and(*input_table)); + case cudf::binary_operator::BITWISE_AND: { + auto bitmask_output = cudf::bitmask_and(*input_table); + copy->set_null_mask(std::move(bitmask_output.mask), bitmask_output.num_unset_bits); break; - case cudf::binary_operator::BITWISE_OR: - copy->set_null_mask(cudf::bitmask_or(*input_table)); + } + case cudf::binary_operator::BITWISE_OR: { + auto bitmask_output = cudf::bitmask_or(*input_table); + copy->set_null_mask(std::move(bitmask_output.mask), bitmask_output.num_unset_bits); break; + } default: JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Unsupported merge operation", 0); } From 362eb854b278727748829454110f2bad90fdad5d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 9 Nov 2021 18:27:14 -0500 Subject: [PATCH 14/17] Code formatting --- cpp/include/cudf/detail/null_mask.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index f1d060a2140..02c93e7e31e 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -75,11 +75,12 @@ __global__ void offset_bitmask_binop(Binop op, // Subtract any slack bits from the last word if (tid == 0) { - size_type const last_bit_index = source_size_bits - 1; + size_type const last_bit_index = source_size_bits - 1; size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1; if (num_slack_bits > 0) { size_type word_index = cudf::word_index(last_bit_index); - thread_valid_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits)); + thread_valid_count -= + __popc(destination[word_index] & set_most_significant_bits(num_slack_bits)); } } From bf1bca06270f88b33759679410fdc9fc89df8600 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 9 Nov 2021 22:10:33 -0500 Subject: [PATCH 15/17] Updates: return a pair of mask and null count --- cpp/include/cudf/detail/null_mask.cuh | 20 ++++----- cpp/include/cudf/detail/null_mask.hpp | 28 ++++++------- cpp/include/cudf/null_mask.hpp | 29 +++++-------- cpp/src/binaryop/binaryop.cpp | 20 +++------ cpp/src/bitmask/null_mask.cu | 36 ++++++++-------- cpp/src/datetime/datetime_ops.cu | 4 +- cpp/src/groupby/hash/groupby.cu | 4 +- cpp/src/groupby/sort/aggregate.cpp | 5 +-- cpp/src/groupby/sort/sort_helper.cu | 9 ++-- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/semi_join.cu | 2 +- cpp/src/strings/repeat_strings.cu | 6 +-- cpp/src/structs/utilities.cpp | 15 ++++--- cpp/tests/bitmask/bitmask_tests.cpp | 48 +++++++++------------- java/src/main/native/src/ColumnViewJni.cpp | 8 ++-- 15 files changed, 102 insertions(+), 134 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 02c93e7e31e..753525128bb 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -98,16 +98,17 @@ __global__ void offset_bitmask_binop(Binop op, * @param stream CUDA stream used for device memory operations and kernel launches */ template -bitmask bitmask_binop(Binop op, - host_span masks, - host_span masks_begin_bits, - size_type mask_size_bits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::pair bitmask_binop( + Binop op, + host_span masks, + host_span masks_begin_bits, + size_type mask_size_bits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr}; - - auto valid_count = + auto null_count = + mask_size_bits - inplace_bitmask_binop(op, device_span(static_cast(dest_mask.data()), num_bitmask_words(mask_size_bits)), @@ -116,9 +117,8 @@ bitmask bitmask_binop(Binop op, mask_size_bits, stream, mr); - auto null_count = mask_size_bits - valid_count; - return bitmask{std::move(dest_mask), valid_count, null_count}; + return std::make_pair(std::move(dest_mask), null_count); } /** diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 334d0339ca2..d2819e665df 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -23,9 +23,6 @@ #include namespace cudf { - -struct bitmask; - namespace detail { /** @@ -117,29 +114,32 @@ rmm::device_buffer copy_bitmask( * * @param stream CUDA stream used for device memory operations and kernel launches */ -bitmask bitmask_and(host_span masks, - host_span masks_begin_bits, - size_type mask_size_bits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair bitmask_and( + host_span masks, + host_span masks_begin_bits, + size_type mask_size_bits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_and * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -bitmask bitmask_and(table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair bitmask_and( + table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_or * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -bitmask bitmask_or(table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair bitmask_or( + table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a bitwise AND of the specified bitmasks, diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 1ea3160161a..4b887b20049 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -24,15 +24,6 @@ namespace cudf { -/** - * @brief Bitmask output type. - */ -struct bitmask { - rmm::device_buffer mask; ///< Resulting bitmask - size_type num_set_bits; ///< Number of set bits - size_type num_unset_bits; ///< Number of unset bits -}; - /** * @addtogroup column_nullmask * @{ @@ -211,32 +202,32 @@ rmm::device_buffer copy_bitmask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a struct of bitwise AND of the bitmasks of columns of a table, - * count of valid bits and count of null bits + * @brief Returns a pair of bitwise AND of the bitmasks of columns of a table and count of null bits * * If any of the columns isn't nullable, it is considered all valid. * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns * @param mr Device memory resource used to allocate the returned device_buffer - * @return A struct of resulting bitmask, count of valid bits and count of null bits + * @return A pair of resulting bitmask and count of null bits */ -bitmask bitmask_and(table_view const& view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair bitmask_and( + table_view const& view, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a struct of bitwise OR of the bitmasks of columns of a table, - * count of valid bits and count of null bits + * @brief Returns a pair of bitwise OR of the bitmasks of columns of a table and count of null bits * * If any of the columns isn't nullable, it is considered all valid. * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer Output bitmask + * @return A pair of resulting bitmask and count of null bits */ -bitmask bitmask_or(table_view const& view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair bitmask_or( + table_view const& view, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 794d0676f8a..b9ed95daf1b 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -392,13 +392,9 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); } else { - auto bitmask_output = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); - return make_fixed_width_column(output_type, - lhs.size(), - std::move(bitmask_output.mask), - bitmask_output.num_unset_bits, - stream, - mr); + auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); + return make_fixed_width_column( + output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); } }; @@ -803,13 +799,9 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto bitmask_output = bitmask_and(table_view({lhs, rhs}), stream, mr); - auto out = make_fixed_width_column(output_type, - lhs.size(), - std::move(bitmask_output.mask), - bitmask_output.num_unset_bits, - stream, - mr); + auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto out = + make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); // Check for 0 sized data if (lhs.is_empty() or rhs.is_empty()) return out; diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 09560396c21..1cd3def61ac 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -356,11 +356,11 @@ void inplace_bitmask_and(device_span dest_mask, } // Bitwise AND of the masks -bitmask bitmask_and(host_span masks, - host_span begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair bitmask_and(host_span masks, + host_span begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, @@ -372,14 +372,14 @@ bitmask bitmask_and(host_span masks, } // Returns the bitwise AND of the null masks of all columns in the table view -bitmask bitmask_and(table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { - return bitmask{std::move(null_mask), 0, 0}; + return std::make_pair(std::move(null_mask), 0); } std::vector masks; @@ -401,18 +401,18 @@ bitmask bitmask_and(table_view const& view, mr); } - return bitmask{std::move(null_mask), 0, 0}; + return std::make_pair(std::move(null_mask), 0); } // Returns the bitwise OR of the null masks of all columns in the table view -bitmask bitmask_or(table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { - return bitmask{std::move(null_mask), 0, 0}; + return std::make_pair(std::move(null_mask), 0); } std::vector masks; @@ -434,7 +434,7 @@ bitmask bitmask_or(table_view const& view, mr); } - return bitmask{std::move(null_mask), 0, 0}; + return std::make_pair(std::move(null_mask), 0); } /** @@ -506,12 +506,14 @@ rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_ return detail::copy_bitmask(view, rmm::cuda_stream_default, mr); } -bitmask bitmask_and(table_view const& view, rmm::mr::device_memory_resource* mr) +std::pair bitmask_and(table_view const& view, + rmm::mr::device_memory_resource* mr) { return detail::bitmask_and(view, rmm::cuda_stream_default, mr); } -bitmask bitmask_or(table_view const& view, rmm::mr::device_memory_resource* mr) +std::pair bitmask_or(table_view const& view, + rmm::mr::device_memory_resource* mr) { return detail::bitmask_or(view, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index fef720da174..7d66daf226e 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -364,9 +364,9 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu stream, mr); - auto bitmask_output = + auto [output_null_mask, null_count] = cudf::detail::bitmask_and(table_view{{timestamp_column, months_column}}, stream, mr); - output->set_null_mask(std::move(bitmask_output.mask), bitmask_output.num_unset_bits); + output->set_null_mask(std::move(output_null_mask), null_count); return output; } diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c29fc272078..1b9b2a196c3 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -390,7 +390,7 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).mask; + auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; @@ -502,7 +502,7 @@ void compute_single_pass_aggs(table_view const& keys, bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto row_bitmask = - skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).mask : rmm::device_buffer{}; + skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).first : rmm::device_buffer{}; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index b263cbb9732..234bb447761 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -538,10 +538,7 @@ void aggregate_result_functor::operator()(aggregation con */ auto column_view_with_common_nulls(column_view const& column_0, column_view const& column_1) { - auto bitmask_output = cudf::bitmask_and(table_view{{column_0, column_1}}); - auto new_nullmask = std::move(bitmask_output.mask); - auto null_count = bitmask_output.num_unset_bits; - + auto [new_nullmask, null_count] = cudf::bitmask_and(table_view{{column_0, column_1}}); if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); } auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { return column_view(col.type(), diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index be26d9e52af..7adb4ccec76 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -276,13 +276,10 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto bitmask_output = cudf::detail::bitmask_and(_keys, stream); + auto [row_bitmask, null_count] = cudf::detail::bitmask_and(_keys, stream); - _keys_bitmask_column = make_numeric_column(data_type(type_id::INT8), - _keys.num_rows(), - std::move(bitmask_output.mask), - bitmask_output.num_unset_bits, - stream); + _keys_bitmask_column = make_numeric_column( + data_type(type_id::INT8), _keys.num_rows(), std::move(row_bitmask), null_count, stream); auto keys_bitmask_view = _keys_bitmask_column->mutable_view(); using T = id_to_type; diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index affde8228ce..d5065278afc 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -92,7 +92,7 @@ void build_join_hash_table(cudf::table_view const& build, hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); - auto const row_bitmask = cudf::detail::bitmask_and(build, stream).mask; + auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index b6433578077..5b5dd418a97 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -97,7 +97,7 @@ std::unique_ptr> left_semi_anti_join( // contain a NULL in any column as they will never compare to equal. auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{} - : cudf::detail::bitmask_and(right_flattened_keys, stream).mask; + : cudf::detail::bitmask_and(right_flattened_keys, stream).first; // skip rows that are null here. thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index beaa68b9cc5..458f3ed885c 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -319,14 +319,14 @@ std::unique_ptr repeat_strings(strings_column_view const& input, // We generate new bitmask by AND of the input columns' bitmasks. // Note that if the input columns are nullable, the output column will also be nullable (which may // not have nulls). - auto bitmask_out = + auto [null_mask, null_count] = cudf::detail::bitmask_and(table_view{{input.parent(), repeat_times}}, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), - bitmask_out.num_unset_bits, - std::move(bitmask_out.mask)); + null_count, + std::move(null_mask)); } std::pair, int64_t> repeat_strings_output_sizes( diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 7628ea51f15..d4e2f48feba 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -379,15 +379,14 @@ std::tuple> superimpose_paren // and the _null_mask(). It would be better to AND the bits from the beginning, and apply // offset() uniformly. // Alternatively, one could construct a big enough buffer, and use inplace_bitwise_and. - auto bitmask_output = cudf::detail::bitmask_and(parent_child_null_masks, - std::vector{0, 0}, - child.offset() + child.size(), - stream, - mr); - ret_validity_buffers.push_back(std::move(bitmask_output.mask)); + auto [new_mask, null_count] = cudf::detail::bitmask_and(parent_child_null_masks, + std::vector{0, 0}, + child.offset() + child.size(), + stream, + mr); + ret_validity_buffers.push_back(std::move(new_mask)); return std::make_pair( - reinterpret_cast(ret_validity_buffers.back().data()), - bitmask_output.num_unset_bits); + reinterpret_cast(ret_validity_buffers.back().data()), null_count); }(); return cudf::column_view( diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index 72853f1f15a..c7ae6e12366 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -545,29 +545,25 @@ TEST_F(MergeBitmaskTest, TestBitmaskAnd) auto const input2 = cudf::table_view({bools_col1, bools_col2}); auto const input3 = cudf::table_view({bools_col1, bools_col2, bools_col3}); - auto result1 = cudf::bitmask_and(input1); - auto result2 = cudf::bitmask_and(input2); - auto result3 = cudf::bitmask_and(input3); + auto [result1_mask, result1_null_count] = cudf::bitmask_and(input1); + auto [result2_mask, result2_null_count] = cudf::bitmask_and(input2); + auto [result3_mask, result3_null_count] = cudf::bitmask_and(input3); - constexpr cudf::size_type gold_valid_count = 2; - constexpr cudf::size_type gold_null_count = 3; + constexpr cudf::size_type gold_null_count = 3; - EXPECT_EQ(result1.num_set_bits, 0); - EXPECT_EQ(result1.num_unset_bits, 0); - EXPECT_EQ(result2.num_set_bits, gold_valid_count); - EXPECT_EQ(result2.num_unset_bits, gold_null_count); - EXPECT_EQ(result3.num_set_bits, gold_valid_count); - EXPECT_EQ(result3.num_unset_bits, gold_null_count); + EXPECT_EQ(result1_null_count, 0); + EXPECT_EQ(result2_null_count, gold_null_count); + EXPECT_EQ(result3_null_count, gold_null_count); auto odd_indices = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); auto odd = cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows()); - EXPECT_EQ(nullptr, result1.mask.data()); + EXPECT_EQ(nullptr, result1_mask.data()); CUDF_TEST_EXPECT_EQUAL_BUFFERS( - result2.mask.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows())); + result2_mask.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows())); CUDF_TEST_EXPECT_EQUAL_BUFFERS( - result3.mask.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows())); + result3_mask.data(), odd.data(), cudf::num_bitmask_words(input2.num_rows())); } TEST_F(MergeBitmaskTest, TestBitmaskOr) @@ -580,29 +576,23 @@ TEST_F(MergeBitmaskTest, TestBitmaskOr) auto const input2 = cudf::table_view({bools_col1, bools_col2}); auto const input3 = cudf::table_view({bools_col1, bools_col2, bools_col3}); - auto result1 = cudf::bitmask_or(input1); - auto result2 = cudf::bitmask_or(input2); - auto result3 = cudf::bitmask_or(input3); + auto [result1_mask, result1_null_count] = cudf::bitmask_or(input1); + auto [result2_mask, result2_null_count] = cudf::bitmask_or(input2); + auto [result3_mask, result3_null_count] = cudf::bitmask_or(input3); - constexpr cudf::size_type gold_valid_count = 4; - constexpr cudf::size_type gold_null_count = 1; - - EXPECT_EQ(result1.num_set_bits, 0); - EXPECT_EQ(result1.num_unset_bits, 0); - EXPECT_EQ(result2.num_set_bits, gold_valid_count); - EXPECT_EQ(result2.num_unset_bits, gold_null_count); - EXPECT_EQ(result3.num_set_bits, 0); - EXPECT_EQ(result3.num_unset_bits, 0); + EXPECT_EQ(result1_null_count, 0); + EXPECT_EQ(result2_null_count, 1); + EXPECT_EQ(result3_null_count, 0); auto all_but_index3 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); auto null3 = cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows()); - EXPECT_EQ(nullptr, result1.mask.data()); + EXPECT_EQ(nullptr, result1_mask.data()); CUDF_TEST_EXPECT_EQUAL_BUFFERS( - result2.mask.data(), null3.data(), cudf::num_bitmask_words(input2.num_rows())); - EXPECT_EQ(nullptr, result3.mask.data()); + result2_mask.data(), null3.data(), cudf::num_bitmask_words(input2.num_rows())); + EXPECT_EQ(nullptr, result3_mask.data()); } CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 679f66887a7..5ae9fd03063 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1493,13 +1493,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit cudf::binary_operator op = static_cast(bin_op); switch (op) { case cudf::binary_operator::BITWISE_AND: { - auto bitmask_output = cudf::bitmask_and(*input_table); - copy->set_null_mask(std::move(bitmask_output.mask), bitmask_output.num_unset_bits); + auto [new_bitmask, null_count] = cudf::bitmask_and(*input_table); + copy->set_null_mask(std::move(new_bitmask), null_count); break; } case cudf::binary_operator::BITWISE_OR: { - auto bitmask_output = cudf::bitmask_or(*input_table); - copy->set_null_mask(std::move(bitmask_output.mask), bitmask_output.num_unset_bits); + auto [new_bitmask, null_count] = cudf::bitmask_or(*input_table); + copy->set_null_mask(std::move(new_bitmask), null_count); break; } default: JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Unsupported merge operation", 0); From 9a0f562a50201d0ad903c292ad27bacb877e7122 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 10 Nov 2021 11:11:51 -0500 Subject: [PATCH 16/17] Update docs --- cpp/include/cudf/detail/null_mask.cuh | 6 +++--- cpp/include/cudf/null_mask.hpp | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 753525128bb..15db29c1b0d 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -39,7 +39,7 @@ namespace detail { * @param source_begin_bits Array of offsets into corresponding @p source masks. * Must be same size as source array * @param source_size_bits Number of bits in each mask in @p source - * @param count Pointer to valid-bit counter + * @param count Pointer to counter of set bits */ template __global__ void offset_bitmask_binop(Binop op, @@ -123,7 +123,7 @@ std::pair bitmask_binop( /** * @brief Performs a merge of the specified bitmasks using the binary operator - * provided, writes in place to destination and returns count of valid bits + * provided, writes in place to destination and returns count of set bits * * @param[in] op The binary operator used to combine the bitmasks * @param[out] dest_mask Destination to which the merged result is written @@ -132,7 +132,7 @@ std::pair bitmask_binop( * @param[in] mask_size_bits The number of bits to be ANDed in each mask * @param[in] stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate the returned device_buffer - * @return size_type Count of valid bits + * @return size_type Count of set bits */ template size_type inplace_bitmask_binop( diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 4b887b20049..c74e077dc32 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -202,28 +202,30 @@ rmm::device_buffer copy_bitmask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a pair of bitwise AND of the bitmasks of columns of a table and count of null bits + * @brief Performs bitwise AND of the bitmasks of columns of a table. Returns + * a pair of resulting mask and count of unset bits. * * If any of the columns isn't nullable, it is considered all valid. * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns * @param mr Device memory resource used to allocate the returned device_buffer - * @return A pair of resulting bitmask and count of null bits + * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_and( table_view const& view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a pair of bitwise OR of the bitmasks of columns of a table and count of null bits + * @brief Performs bitwise OR of the bitmasks of columns of a table. Returns + * a pair of resulting mask and count of unset bits. * * If any of the columns isn't nullable, it is considered all valid. * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns * @param mr Device memory resource used to allocate the returned device_buffer - * @return A pair of resulting bitmask and count of null bits + * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_or( table_view const& view, From 9baa00a68ed2825b81d4f43df14f73095224c4a3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 10 Nov 2021 11:30:35 -0500 Subject: [PATCH 17/17] Corrections --- cpp/include/cudf/detail/null_mask.cuh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 15db29c1b0d..cf8c3343406 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -47,12 +47,12 @@ __global__ void offset_bitmask_binop(Binop op, device_span source, device_span source_begin_bits, size_type source_size_bits, - size_type* valid_count_ptr) + size_type* count_ptr) { constexpr auto const word_size{detail::size_in_bits()}; auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_type thread_valid_count = 0; + size_type thread_count = 0; for (size_type destination_word_index = tid; destination_word_index < destination.size(); destination_word_index += blockDim.x * gridDim.x) { @@ -70,7 +70,7 @@ __global__ void offset_bitmask_binop(Binop op, } destination[destination_word_index] = destination_word; - thread_valid_count += __popc(destination_word); + thread_count += __popc(destination_word); } // Subtract any slack bits from the last word @@ -78,17 +78,16 @@ __global__ void offset_bitmask_binop(Binop op, size_type const last_bit_index = source_size_bits - 1; size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1; if (num_slack_bits > 0) { - size_type word_index = cudf::word_index(last_bit_index); - thread_valid_count -= - __popc(destination[word_index] & set_most_significant_bits(num_slack_bits)); + size_type const word_index = cudf::word_index(last_bit_index); + thread_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits)); } } using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - size_type block_valid_count = BlockReduce(temp_storage).Sum(thread_valid_count); + size_type block_count = BlockReduce(temp_storage).Sum(thread_count); - if (threadIdx.x == 0) { atomicAdd(valid_count_ptr, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(count_ptr, block_count); } } /**