Skip to content

Commit

Permalink
Superimpose null masks for STRUCT columns. (#9144)
Browse files Browse the repository at this point in the history
Per #5700, when a STRUCT column is constructed, the null mask of the parent
column is bitwise-ANDed into the null mask of each of its children, so that a
null row in the parent column is also null in every one of its children. This
is done recursively, so that grandchild columns also have nulls at the same
row positions.

`superimpose_parent_nulls()` makes this functionality available for columns
that might not have been constructed through `make_struct_column()`, e.g.
with columns received directly from Arrow. It does not require that the
`column_view` is modifiable. For a STRUCT `column_view` argument, a new
equivalent instance is created, with all its children's null masks modified
to account for the parent nulls.

`superimpose_parent_nulls()` can be used for all code that assumes that the
child null masks account for the nulls in the parents (and grandparents,
ad infinitum).

Authors:
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Conor Hoekstra (https://github.com/codereport)

URL: #9144
  • Loading branch information
mythrocks authored Sep 2, 2021
1 parent f5e870b commit cd4c8c7
Show file tree
Hide file tree
Showing 3 changed files with 400 additions and 6 deletions.
82 changes: 82 additions & 0 deletions cpp/src/structs/utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,15 @@
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/unary.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>
#include <cudf/utilities/traits.hpp>
#include <structs/utilities.hpp>

#include <bitset>

namespace cudf {
namespace structs {
namespace detail {
Expand Down Expand Up @@ -337,6 +341,84 @@ void superimpose_parent_nulls(bitmask_type const* parent_null_mask,
}
}

std::tuple<cudf::column_view, std::vector<rmm::device_buffer>> superimpose_parent_nulls(
  column_view const& parent, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
{
  // Only STRUCT columns have members to push nulls into; everything else passes through as-is,
  // with no supporting buffers.
  if (parent.type().id() != type_id::STRUCT) {
    return std::make_tuple(parent, std::vector<rmm::device_buffer>{});
  }

  auto structs_column = structs_column_view{parent};

  // Every null mask allocated below (directly, or by a recursive call) is collected here and
  // handed back to the caller: the returned views hold raw pointers into these buffers.
  std::vector<rmm::device_buffer> owned_masks;
  std::vector<cudf::column_view> updated_children;

  auto const num_members = structs_column.num_children();
  for (size_type member_idx = 0; member_idx < num_members; ++member_idx) {
    auto const member = structs_column.get_sliced_child(member_idx);

    // Step 1: produce a view of this member whose null mask reflects the STRUCT's own nulls.
    auto const member_with_parent_nulls = [&]() -> cudf::column_view {
      // A STRUCT without a null mask has nothing to contribute: keep the member untouched.
      if (not structs_column.nullable()) { return member; }

      bitmask_type const* member_mask = nullptr;
      if (member.nullable()) {
        // Both the STRUCT and the member carry a null mask: the member's new mask is their AND.
        //
        // The AND deliberately covers bits [0, offset() + size()) rather than only
        // [offset(), offset() + size()). A mask computed over the latter range would begin at
        // bit 0, yet the column_view applies its offset() to the data and to the null mask
        // alike. ANDing from the very first bit keeps offset() meaningful for both.
        // (An alternative would be a sufficiently large buffer and inplace_bitwise_and.)
        std::vector<cudf::bitmask_type const*> masks_to_and{structs_column.null_mask(),
                                                            member.null_mask()};
        owned_masks.push_back(cudf::detail::bitmask_and(masks_to_and,
                                                        std::vector<size_type>{0, 0},
                                                        member.offset() + member.size(),
                                                        stream,
                                                        mr));
        member_mask = reinterpret_cast<bitmask_type const*>(owned_masks.back().data());
      } else {
        // The member has no mask of its own: it simply borrows the STRUCT's mask.
        member_mask = structs_column.null_mask();
      }

      return cudf::column_view(
        member.type(),
        member.size(),
        member.head(),
        member_mask,
        cudf::UNKNOWN_NULL_COUNT,
        member.offset(),
        std::vector<cudf::column_view>{member.child_begin(), member.child_end()});
    }();

    // Step 2: recurse, so that a STRUCT member pushes the (updated) nulls to its own members.
    auto [finished_member, nested_masks] =
      superimpose_parent_nulls(member_with_parent_nulls, stream, mr);
    updated_children.push_back(finished_member);
    owned_masks.insert(owned_masks.end(),
                       std::make_move_iterator(nested_masks.begin()),
                       std::make_move_iterator(nested_masks.end()));
  }

  // Reassemble the STRUCT view around the rewritten members. The parent's own mask and null
  // count are carried over unchanged.
  return std::make_tuple(column_view(parent.type(),
                                     parent.size(),
                                     nullptr,
                                     parent.null_mask(),
                                     parent.null_count(),  // Alternatively, postpone.
                                     parent.offset(),
                                     updated_children),
                         std::move(owned_masks));
}

} // namespace detail
} // namespace structs
} // namespace cudf
27 changes: 26 additions & 1 deletion cpp/src/structs/utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace structs {
namespace detail {
Expand Down Expand Up @@ -106,7 +108,7 @@ std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::tabl
table_view const& blueprint);

/**
* @brief Pushdown nulls from a parent mask into a child column, using AND.
* @brief Push down nulls from a parent mask into a child column, using bitwise AND.
*
* This function will recurse through all struct descendants. It is expected that
* the size of `parent_null_mask` in bits is the same as `child.size()`
Expand All @@ -123,6 +125,29 @@ void superimpose_parent_nulls(bitmask_type const* parent_null_mask,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
 * @brief Push down nulls from a parent mask into a child column, using bitwise AND.
 *
 * This function constructs a new column_view instance equivalent to the argument column_view,
 * with possibly new child column_views, all with possibly new null mask values reflecting
 * null rows from the parent column:
 *   1. If the specified column is not STRUCT, the column is returned unmodified, with no new
 *      supporting device_buffer instances.
 *   2. If the column is STRUCT, the null masks of the parent and child are bitwise-ANDed, and a
 *      modified column_view is returned. This applies recursively.
 *        - If the STRUCT column is not nullable, its children keep their existing null masks.
 *        - If a child is not nullable, it adopts the parent's null mask directly, and no new
 *          buffer is allocated for that child.
 *
 * The argument column_view is not modified. The child views in the result point into the
 * returned device_buffer instances for any newly constructed null masks, so those buffers
 * must be kept alive for as long as the returned column_view is in use.
 *
 * @param parent The parent (possibly STRUCT) column whose nulls need to be pushed to its members.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr     Device memory resource used to allocate new device memory.
 * @return A tuple of:
 *         1. column_view with nulls pushed down to child columns, as appropriate.
 *         2. Supporting device_buffer instances, for any newly constructed null masks.
 */
std::tuple<cudf::column_view, std::vector<rmm::device_buffer>> superimpose_parent_nulls(
column_view const& parent,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
} // namespace structs
} // namespace cudf
Loading

0 comments on commit cd4c8c7

Please sign in to comment.