Skip to content

Commit

Permalink
Fix memory issues in cuIO due to removal of memory padding (#13586)
Browse files Browse the repository at this point in the history
After `rmm` removed memory padding (rapidsai/rmm#1278), some cuIO code began to exhibit out-of-bounds access issues, because many of its compute kernels shift input pointers back and forth to satisfy alignment requirements.

This change adds padding back to various memory buffers so that they have enough extra space for such pointer shifting.

With this fix, the reported issues (#13567, #13571, #13570) no longer occur.

Closes:
 * #13567
 * #13571
 * #13570

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #13586
  • Loading branch information
ttnghia authored Jun 23, 2023
1 parent 0688872 commit 0b4e354
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 14 deletions.
14 changes: 14 additions & 0 deletions cpp/src/io/comp/gpuinflate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ struct compression_result {

enum class gzip_header_included { NO, YES };

/**
 * @brief The value used to pad a data buffer's size so that the size is a multiple of it.
 *
 * Padding is necessary for the input/output buffers of several compression/decompression
 * kernels (inflate_kernel and nvcomp snappy). Such kernels operate on aligned data pointers,
 * which requires padding the buffers so that those pointers can be shifted along the address
 * space to satisfy their alignment requirement without going out of bounds.
 *
 * At the moment, it is not entirely clear why such padding is needed. We need to further
 * investigate and implement a better fix rather than just padding the buffer.
 * See https://github.com/rapidsai/cudf/issues/13605.
 */
constexpr std::size_t BUFFER_PADDING_MULTIPLE{8};

/**
* @brief Interface for decompressing GZIP-compressed data
*
Expand Down
10 changes: 8 additions & 2 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,10 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
}
CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found");

rmm::device_buffer decomp_data(total_decomp_size, stream);
// Buffer needs to be padded.
// Required by `gpuDecodeOrcColumnData`.
rmm::device_buffer decomp_data(
cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
rmm::device_uvector<device_span<uint8_t const>> inflate_in(
num_compressed_blocks + num_uncompressed_blocks, stream);
rmm::device_uvector<device_span<uint8_t>> inflate_out(
Expand Down Expand Up @@ -1067,7 +1070,10 @@ table_with_metadata reader::impl::read(int64_t skip_rows,
CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
"Invalid index rowgroup stream data");

stripe_data.emplace_back(total_data_size, stream);
// Buffer needs to be padded.
// Required by `copy_uncompressed_kernel`.
stripe_data.emplace_back(
cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), stream);
auto dst_base = static_cast<uint8_t*>(stripe_data.back().data());

// Coalesce consecutive streams into one read
Expand Down
22 changes: 16 additions & 6 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -256,15 +256,23 @@ template <typename T = uint8_t>
if (io_size != 0) {
auto& source = sources[chunk_source_map[chunk]];
if (source->is_device_read_preferred(io_size)) {
auto buffer = rmm::device_buffer(io_size, stream);
// Buffer needs to be padded.
// Required by `gpuDecodePageData`.
auto buffer =
rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream);
auto fut_read_size = source->device_read_async(
io_offset, io_size, static_cast<uint8_t*>(buffer.data()), stream);
read_tasks.emplace_back(std::move(fut_read_size));
page_data[chunk] = datasource::buffer::create(std::move(buffer));
} else {
auto const buffer = source->host_read(io_offset, io_size);
page_data[chunk] =
datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream));
auto const read_buffer = source->host_read(io_offset, io_size);
// Buffer needs to be padded.
// Required by `gpuDecodePageData`.
auto tmp_buffer = rmm::device_buffer(
cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(
tmp_buffer.data(), read_buffer->data(), read_buffer->size(), cudaMemcpyDefault, stream));
page_data[chunk] = datasource::buffer::create(std::move(tmp_buffer));
}
auto d_compdata = page_data[chunk]->data();
do {
Expand Down Expand Up @@ -440,8 +448,10 @@ int decode_page_headers(cudf::detail::hostdevice_vector<gpu::ColumnChunkDesc>& c
}
}

// Dispatch batches of pages to decompress for each codec
rmm::device_buffer decomp_pages(total_decomp_size, stream);
// Dispatch batches of pages to decompress for each codec.
// Buffer needs to be padded, required by `gpuDecodePageData`.
rmm::device_buffer decomp_pages(
cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);

std::vector<device_span<uint8_t const>> comp_in;
comp_in.reserve(num_comp_pages);
Expand Down
11 changes: 9 additions & 2 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/detail/utilities/linked_column.hpp>
#include <cudf/detail/utilities/pinned_host_vector.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
Expand Down Expand Up @@ -1831,8 +1832,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
// Initialize data pointers in batch
uint32_t const num_stats_bfr =
(stats_granularity != statistics_freq::STATISTICS_NONE) ? num_pages + num_chunks : 0;
rmm::device_buffer uncomp_bfr(max_uncomp_bfr_size, stream);
rmm::device_buffer comp_bfr(max_comp_bfr_size, stream);

// Buffers need to be padded.
// Required by `gpuGatherPages`.
rmm::device_buffer uncomp_bfr(
cudf::util::round_up_safe(max_uncomp_bfr_size, BUFFER_PADDING_MULTIPLE), stream);
rmm::device_buffer comp_bfr(cudf::util::round_up_safe(max_comp_bfr_size, BUFFER_PADDING_MULTIPLE),
stream);

rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream);
rmm::device_uvector<gpu::EncPage> pages(num_pages, stream);

Expand Down
12 changes: 8 additions & 4 deletions cpp/src/io/text/bgzip_data_chunk_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
* limitations under the License.
*/

#include "io/comp/nvcomp_adapter.hpp"
#include "io/text/device_data_chunks.hpp"
#include "io/utilities/config_utils.hpp"

#include <io/comp/nvcomp_adapter.hpp>
#include <io/utilities/config_utils.hpp>

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/detail/utilities/pinned_host_vector.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
Expand Down Expand Up @@ -69,7 +71,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
rmm::device_uvector<T>& device,
rmm::cuda_stream_view stream)
{
device.resize(host.size(), stream);
// Buffer needs to be padded.
// Required by `inflate_kernel`.
device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream);
CUDF_CUDA_TRY(cudaMemcpyAsync(
device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
}
Expand Down Expand Up @@ -139,7 +143,7 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
offset_it + num_blocks(),
span_it,
bgzip_nvcomp_transform_functor{reinterpret_cast<uint8_t const*>(d_compressed_blocks.data()),
reinterpret_cast<uint8_t*>(d_decompressed_blocks.begin())});
reinterpret_cast<uint8_t*>(d_decompressed_blocks.data())});
if (decompressed_size() > 0) {
if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) {
gpuinflate(d_compressed_spans,
Expand Down

0 comments on commit 0b4e354

Please sign in to comment.