Skip to content

Commit

Permalink
Fix memory issues in cuIO due to removal of memory padding (#13586)
Browse files Browse the repository at this point in the history
After `rmm` removed memory padding (rapidsai/rmm#1278), some cuIO code began to exhibit out-of-bounds access issues, because many of its compute kernels shift input pointers back and forth to satisfy alignment requirements.

This change adds padding back to various memory buffers so that they have enough extra space for such pointer shifting.

With this fix, the reported issues (#13567, #13571, #13570) no longer occur.

Closes:
 * #13567
 * #13571
 * #13570

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #13586
  • Loading branch information
ttnghia authored Jun 23, 2023
1 parent 0688872 commit 0b4e354
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 14 deletions.
14 changes: 14 additions & 0 deletions cpp/src/io/comp/gpuinflate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ struct compression_result {

enum class gzip_header_included { NO, YES };

/**
 * @brief The value used to pad a data buffer's size so that the size is a multiple of it.
 *
 * Padding is necessary for the input/output buffers of several compression/decompression
 * kernels (inflate_kernel and nvcomp snappy). Such kernels operate on aligned data pointers,
 * which requires padding the buffers so that those pointers can be shifted along the address
 * space to satisfy their alignment requirement without going out of bounds.
 *
 * At the moment, it is not entirely clear why such padding is needed. We need to further
 * investigate and implement a better fix rather than just padding the buffer.
 * See https://github.com/rapidsai/cudf/issues/13605.
 */
constexpr std::size_t BUFFER_PADDING_MULTIPLE{8};

/**
* @brief Interface for decompressing GZIP-compressed data
*
Expand Down
10 changes: 8 additions & 2 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,10 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
}
CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found");

rmm::device_buffer decomp_data(total_decomp_size, stream);
// Buffer needs to be padded.
// Required by `gpuDecodeOrcColumnData`.
rmm::device_buffer decomp_data(
cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
rmm::device_uvector<device_span<uint8_t const>> inflate_in(
num_compressed_blocks + num_uncompressed_blocks, stream);
rmm::device_uvector<device_span<uint8_t>> inflate_out(
Expand Down Expand Up @@ -1067,7 +1070,10 @@ table_with_metadata reader::impl::read(int64_t skip_rows,
CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
"Invalid index rowgroup stream data");

stripe_data.emplace_back(total_data_size, stream);
// Buffer needs to be padded.
// Required by `copy_uncompressed_kernel`.
stripe_data.emplace_back(
cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), stream);
auto dst_base = static_cast<uint8_t*>(stripe_data.back().data());

// Coalesce consecutive streams into one read
Expand Down
22 changes: 16 additions & 6 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -256,15 +256,23 @@ template <typename T = uint8_t>
if (io_size != 0) {
auto& source = sources[chunk_source_map[chunk]];
if (source->is_device_read_preferred(io_size)) {
auto buffer = rmm::device_buffer(io_size, stream);
// Buffer needs to be padded.
// Required by `gpuDecodePageData`.
auto buffer =
rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream);
auto fut_read_size = source->device_read_async(
io_offset, io_size, static_cast<uint8_t*>(buffer.data()), stream);
read_tasks.emplace_back(std::move(fut_read_size));
page_data[chunk] = datasource::buffer::create(std::move(buffer));
} else {
auto const buffer = source->host_read(io_offset, io_size);
page_data[chunk] =
datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream));
auto const read_buffer = source->host_read(io_offset, io_size);
// Buffer needs to be padded.
// Required by `gpuDecodePageData`.
auto tmp_buffer = rmm::device_buffer(
cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(
tmp_buffer.data(), read_buffer->data(), read_buffer->size(), cudaMemcpyDefault, stream));
page_data[chunk] = datasource::buffer::create(std::move(tmp_buffer));
}
auto d_compdata = page_data[chunk]->data();
do {
Expand Down Expand Up @@ -440,8 +448,10 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
}
}

// Dispatch batches of pages to decompress for each codec
rmm::device_buffer decomp_pages(total_decomp_size, stream);
// Dispatch batches of pages to decompress for each codec.
// Buffer needs to be padded, required by `gpuDecodePageData`.
rmm::device_buffer decomp_pages(
cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);

std::vector<device_span<uint8_t const>> comp_in;
comp_in.reserve(num_comp_pages);
Expand Down
11 changes: 9 additions & 2 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/detail/utilities/linked_column.hpp>
#include <cudf/detail/utilities/pinned_host_vector.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
Expand Down Expand Up @@ -1831,8 +1832,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
// Initialize data pointers in batch
uint32_t const num_stats_bfr =
(stats_granularity != statistics_freq::STATISTICS_NONE) ? num_pages + num_chunks : 0;
rmm::device_buffer uncomp_bfr(max_uncomp_bfr_size, stream);
rmm::device_buffer comp_bfr(max_comp_bfr_size, stream);

// Buffers need to be padded.
// Required by `gpuGatherPages`.
rmm::device_buffer uncomp_bfr(
cudf::util::round_up_safe(max_uncomp_bfr_size, BUFFER_PADDING_MULTIPLE), stream);
rmm::device_buffer comp_bfr(cudf::util::round_up_safe(max_comp_bfr_size, BUFFER_PADDING_MULTIPLE),
stream);

rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream);
rmm::device_uvector<gpu::EncPage> pages(num_pages, stream);

Expand Down
12 changes: 8 additions & 4 deletions cpp/src/io/text/bgzip_data_chunk_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
* limitations under the License.
*/

#include "io/comp/nvcomp_adapter.hpp"
#include "io/text/device_data_chunks.hpp"
#include "io/utilities/config_utils.hpp"

#include <io/comp/nvcomp_adapter.hpp>
#include <io/utilities/config_utils.hpp>

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/detail/utilities/pinned_host_vector.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
Expand Down Expand Up @@ -69,7 +71,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
rmm::device_uvector<T>& device,
rmm::cuda_stream_view stream)
{
device.resize(host.size(), stream);
// Buffer needs to be padded.
// Required by `inflate_kernel`.
device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(
device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
}
Expand Down Expand Up @@ -139,7 +143,7 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
offset_it + num_blocks(),
span_it,
bgzip_nvcomp_transform_functor{reinterpret_cast<uint8_t const*>(d_compressed_blocks.data()),
reinterpret_cast<uint8_t*>(d_decompressed_blocks.begin())});
reinterpret_cast<uint8_t*>(d_decompressed_blocks.data())});
if (decompressed_size() > 0) {
if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) {
gpuinflate(d_compressed_spans,
Expand Down

0 comments on commit 0b4e354

Please sign in to comment.