Update bitmask_and and bitmask_or to return a pair of resulting mask and count of unset bits #9616

Merged Nov 11, 2021 (17 commits)
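For orientation (not part of the diff): a minimal sketch of how a caller consumes the updated API, assuming only what the headers changed below declare. The table view `tbl` and the function `example` are illustrative.

#include <cudf/null_mask.hpp>
#include <cudf/table/table_view.hpp>

// Illustrative caller: bitmask_and (and likewise bitmask_or) now returns the
// combined mask together with the count of unset bits, i.e. the null count,
// so downstream code no longer needs cudf::UNKNOWN_NULL_COUNT.
void example(cudf::table_view const& tbl)
{
  auto [mask, null_count] = cudf::bitmask_and(tbl);  // mask: rmm::device_buffer,
                                                     // null_count: cudf::size_type
}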
103 changes: 66 additions & 37 deletions cpp/include/cudf/detail/null_mask.cuh
@@ -23,46 +23,71 @@
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>

namespace cudf {
namespace detail {
/**
* @brief Computes the merger of an array of bitmasks using a binary operator
*
+* @tparam block_size Number of threads in each thread block
+* @tparam Binop Type of binary operator
*
* @param op The binary operator used to combine the bitmasks
* @param destination The bitmask to write result into
* @param source Array of source mask pointers. All masks must be of same size
* @param source_begin_bits Array of offsets into corresponding @p source masks.
* Must be same size as source array
* @param source_size_bits Number of bits in each mask in @p source
+* @param count Pointer to counter of set bits
*/
-template <typename Binop>
+template <int block_size, typename Binop>
__global__ void offset_bitmask_binop(Binop op,
device_span<bitmask_type> destination,
device_span<bitmask_type const*> source,
device_span<size_type const> source_begin_bits,
-size_type source_size_bits)
+size_type source_size_bits,
+size_type* count_ptr)
{
-for (size_type destination_word_index = threadIdx.x + blockIdx.x * blockDim.x;
-destination_word_index < destination.size();
+constexpr auto const word_size{detail::size_in_bits<bitmask_type>()};
+auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+size_type thread_count = 0;
+
+for (size_type destination_word_index = tid; destination_word_index < destination.size();
destination_word_index += blockDim.x * gridDim.x) {
bitmask_type destination_word =
detail::get_mask_offset_word(source[0],
destination_word_index,
source_begin_bits[0],
source_begin_bits[0] + source_size_bits);
for (size_type i = 1; i < source.size(); i++) {
-destination_word =
-op(destination_word,
-detail::get_mask_offset_word(source[i],
-destination_word_index,
-source_begin_bits[i],
-source_begin_bits[i] + source_size_bits));
+destination_word = op(destination_word,
+detail::get_mask_offset_word(source[i],
+destination_word_index,
+source_begin_bits[i],
+source_begin_bits[i] + source_size_bits));
}

destination[destination_word_index] = destination_word;
+thread_count += __popc(destination_word);
}

+// Subtract any slack bits from the last word
+if (tid == 0) {
+size_type const last_bit_index = source_size_bits - 1;
+size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1;
+if (num_slack_bits > 0) {
+size_type const word_index = cudf::word_index(last_bit_index);
+thread_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits));
+}
+}
+
+using BlockReduce = cub::BlockReduce<size_type, block_size>;
+__shared__ typename BlockReduce::TempStorage temp_storage;
+size_type block_count = BlockReduce(temp_storage).Sum(thread_count);
+
+if (threadIdx.x == 0) { atomicAdd(count_ptr, block_count); }
}
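Aside (not in the diff): a standalone, illustrative check of the slack-bit arithmetic above, assuming 32-bit bitmask words. For a 100-bit mask, only the low 4 bits of the last word are meaningful, so 28 high "slack" bits must be excluded from the popcount.

#include <cassert>
#include <cstdint>

int main()
{
  constexpr int word_size      = 32;                  // bits per bitmask word
  constexpr int mask_size_bits = 100;                 // illustrative mask length
  constexpr int last_bit_index = mask_size_bits - 1;  // 99
  constexpr int num_slack_bits = word_size - (last_bit_index % word_size) - 1;  // 32 - 3 - 1
  // Mask selecting the most significant num_slack_bits of the last word,
  // mirroring set_most_significant_bits(num_slack_bits) in the kernel above.
  constexpr std::uint32_t slack_mask = ~std::uint32_t{0} << (word_size - num_slack_bits);

  assert(num_slack_bits == 28);
  assert(slack_mask == 0xFFFFFFF0u);
  return 0;
}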

/**
@@ -72,7 +97,7 @@ __global__ void offset_bitmask_binop(Binop op,
* @param stream CUDA stream used for device memory operations and kernel launches
*/
template <typename Binop>
-rmm::device_buffer bitmask_binop(
+std::pair<rmm::device_buffer, size_type> bitmask_binop(
Binop op,
host_span<bitmask_type const*> masks,
host_span<size_type const> masks_begin_bits,
@@ -81,34 +106,35 @@ rmm::device_buffer bitmask_binop(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr};

-inplace_bitmask_binop(op,
-device_span<bitmask_type>(static_cast<bitmask_type*>(dest_mask.data()),
-num_bitmask_words(mask_size_bits)),
-masks,
-masks_begin_bits,
-mask_size_bits,
-stream,
-mr);
-
-return dest_mask;
+auto null_count =
+mask_size_bits -
+inplace_bitmask_binop(op,
+device_span<bitmask_type>(static_cast<bitmask_type*>(dest_mask.data()),
+num_bitmask_words(mask_size_bits)),
+masks,
+masks_begin_bits,
+mask_size_bits,
+stream,
+mr);
+
+return std::make_pair(std::move(dest_mask), null_count);
}

/**
* @brief Performs a merge of the specified bitmasks using the binary operator
-* provided, and writes in place to destination
+* provided, writes in place to destination and returns count of set bits
*
-* @param op The binary operator used to combine the bitmasks
-* @param dest_mask Destination to which the merged result is written
-* @param masks The list of data pointers of the bitmasks to be merged
-* @param masks_begin_bits The bit offsets from which each mask is to be merged
-* @param mask_size_bits The number of bits to be ANDed in each mask
-* @param stream CUDA stream used for device memory operations and kernel launches
-* @param mr Device memory resource used to allocate the returned device_buffer
-* @return rmm::device_buffer Output bitmask
+* @param[in] op The binary operator used to combine the bitmasks
+* @param[out] dest_mask Destination to which the merged result is written
+* @param[in] masks The list of data pointers of the bitmasks to be merged
+* @param[in] masks_begin_bits The bit offsets from which each mask is to be merged
+* @param[in] mask_size_bits The number of bits to be ANDed in each mask
+* @param[in] stream CUDA stream used for device memory operations and kernel launches
+* @param[in] mr Device memory resource used to allocate the returned device_buffer
+* @return size_type Count of set bits
*/
template <typename Binop>
-void inplace_bitmask_binop(
+size_type inplace_bitmask_binop(
Binop op,
device_span<bitmask_type> dest_mask,
host_span<bitmask_type const*> masks,
@@ -124,6 +150,7 @@ void inplace_bitmask_binop(
CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
"Mask pointer cannot be null");

+rmm::device_scalar<size_type> d_counter{0, stream, mr};
rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);

@@ -135,11 +162,13 @@ void inplace_bitmask_binop(
cudaMemcpyHostToDevice,
stream.value()));

-cudf::detail::grid_1d config(dest_mask.size(), 256);
-offset_bitmask_binop<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
-op, dest_mask, d_masks, d_begin_bits, mask_size_bits);
+auto constexpr block_size = 256;
+cudf::detail::grid_1d config(dest_mask.size(), block_size);
+offset_bitmask_binop<block_size>
+<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data());
CHECK_CUDA(stream.value());
-stream.synchronize();
+return d_counter.value(stream);
}
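Worth noting (a clarification, not part of the diff): the detail-level inplace_bitmask_binop returns the count of set bits, while the pair-returning functions expose the count of unset bits, i.e. the null count; bitmask_binop bridges the two by subtracting from mask_size_bits. A tiny hypothetical helper, purely for illustration:

#include <cudf/types.hpp>

// Illustrative helper (not added by this PR): how a set-bit total from the
// kernel maps to the null count that bitmask_and/bitmask_or return.
cudf::size_type null_count_from_set_bits(cudf::size_type mask_size_bits,
                                         cudf::size_type set_bit_count)
{
  return mask_size_bits - set_bit_count;  // e.g. 100 bits with 96 set -> 4 nulls
}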

6 changes: 3 additions & 3 deletions cpp/include/cudf/detail/null_mask.hpp
@@ -114,7 +114,7 @@ rmm::device_buffer copy_bitmask(
*
* @param stream CUDA stream used for device memory operations and kernel launches
*/
-rmm::device_buffer bitmask_and(
+std::pair<rmm::device_buffer, size_type> bitmask_and(
host_span<bitmask_type const*> masks,
host_span<size_type const> masks_begin_bits,
size_type mask_size_bits,
@@ -126,7 +126,7 @@ rmm::device_buffer bitmask_and(
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-rmm::device_buffer bitmask_and(
+std::pair<rmm::device_buffer, size_type> bitmask_and(
table_view const& view,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
@@ -136,7 +136,7 @@ rmm::device_buffer bitmask_and(
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-rmm::device_buffer bitmask_or(
+std::pair<rmm::device_buffer, size_type> bitmask_or(
table_view const& view,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
14 changes: 8 additions & 6 deletions cpp/include/cudf/null_mask.hpp
@@ -202,30 +202,32 @@ rmm::device_buffer copy_bitmask(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
-* @brief Returns a bitwise AND of the bitmasks of columns of a table
+* @brief Performs bitwise AND of the bitmasks of columns of a table. Returns
+* a pair of resulting mask and count of unset bits.
*
* If any of the columns isn't nullable, it is considered all valid.
* If no column in the table is nullable, an empty bitmask is returned.
*
* @param view The table of columns
* @param mr Device memory resource used to allocate the returned device_buffer
-* @return rmm::device_buffer Output bitmask
+* @return A pair of resulting bitmask and count of unset bits
*/
-rmm::device_buffer bitmask_and(
+std::pair<rmm::device_buffer, size_type> bitmask_and(
table_view const& view,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
-* @brief Returns a bitwise OR of the bitmasks of columns of a table
+* @brief Performs bitwise OR of the bitmasks of columns of a table. Returns
+* a pair of resulting mask and count of unset bits.
*
* If any of the columns isn't nullable, it is considered all valid.
* If no column in the table is nullable, an empty bitmask is returned.
*
* @param view The table of columns
* @param mr Device memory resource used to allocate the returned device_buffer
-* @return rmm::device_buffer Output bitmask
+* @return A pair of resulting bitmask and count of unset bits
*/
-rmm::device_buffer bitmask_or(
+std::pair<rmm::device_buffer, size_type> bitmask_or(
table_view const& view,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

10 changes: 5 additions & 5 deletions cpp/src/binaryop/binaryop.cpp
@@ -392,9 +392,9 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lhs
if (binops::is_null_dependent(op)) {
return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr);
} else {
-auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr);
+auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr);
return make_fixed_width_column(
-output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+output_type, lhs.size(), std::move(new_mask), null_count, stream, mr);
}
};

Expand Down Expand Up @@ -799,9 +799,9 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,

CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match");

-auto new_mask = bitmask_and(table_view({lhs, rhs}), stream, mr);
-auto out = make_fixed_width_column(
-output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr);
+auto out =
+make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr);

// Check for 0 sized data
if (lhs.is_empty() or rhs.is_empty()) return out;