From cf58b897872e515ca0784ca15a6ed3d047c17e6d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 7 Jun 2021 08:14:52 +0000 Subject: [PATCH 01/80] working on row and column conversions --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 4 + .../row_conversion/row_conversion.cpp | 116 ++ cpp/include/cudf/row_conversion.hpp | 51 + cpp/src/row_conversion/row_conversion.cu | 1106 +++++++++++++++++ 5 files changed, 1278 insertions(+) create mode 100644 cpp/benchmarks/row_conversion/row_conversion.cpp create mode 100644 cpp/include/cudf/row_conversion.hpp create mode 100644 cpp/src/row_conversion/row_conversion.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 82bc5bfba93..785ac1f72de 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -350,6 +350,7 @@ add_library(cudf src/rolling/rolling.cu src/rolling/rolling_collect_list.cu src/round/round.cu + src/row_conversion/row_conversion.cu src/scalar/scalar.cpp src/scalar/scalar_factories.cpp src/search/search.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b3b92003573..7d353c37df7 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -250,3 +250,7 @@ ConfigureBench(JSON_BENCH # - io benchmark --------------------------------------------------------------------- ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) + +################################################################################################### +# - row conversion benchmark --------------------------------------------------------- +ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp new file mode 100644 index 00000000000..c4edee91b3c --- /dev/null +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include "cudf_test/column_utilities.hpp" + +class RowConversion : public cudf::benchmark { +}; + +static void BM_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 50, + row_count{n_rows}); + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + +// auto rows = cudf::convert_to_rows(table->view()); + auto new_rows = cudf::convert_to_rows2(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 4, + row_count{n_rows});*/ + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_to_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 16, 1 << 24}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) + +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_from_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 22}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp new file mode 100644 index 00000000000..f5e2225ad19 --- /dev/null +++ b/cpp/include/cudf/row_conversion.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cudf { + +std::vector> convert_to_rows( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::vector> convert_to_rows2( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu new file mode 100644 index 00000000000..fb5dc4cb38d --- /dev/null +++ b/cpp/src/row_conversion/row_conversion.cu @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "cudf/types.hpp" +#include "rmm/device_buffer.hpp" +#include "thrust/iterator/counting_iterator.h" +#include "thrust/iterator/transform_iterator.h" + +namespace cudf { + +namespace detail { + +static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) +{ + return (offset + alignment - 1) & ~(alignment - 1); +} + + +/** + * Copy a simple vector to device memory asynchronously. Be sure to read + * the data on the same stream as is used to copy it. 
+ */ +template +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync( + ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +template +rmm::device_uvector copy_to_dev_async2( + const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + rmm::device_uvector ret(input.size(), stream, mr); + CUDA_TRY(cudaMemcpyAsync( + ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + 
shared_index]; + } + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } +} + +__global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, + const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, + const cudf::size_type *num_bytes, + const int8_t **input_data, + const cudf::bitmask_type **input_nm, + int8_t *output_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
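+  // As a purely hypothetical illustration (these numbers are not taken from the
+  // actual launch configuration): a launch with blockDim = (128, 4) gives
+  // rows_per_group = 128 rows per row group below, and each thread walks the
+  // columns of its row with a stride of blockDim.y = 4, i.e. columns
+  // threadIdx.y, threadIdx.y + 4, threadIdx.y + 8, ...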
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. + if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; + } + __syncthreads(); + // Go for the next round + } +} + +struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; +}; + +/** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ +__global__ void copy_from_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const int8_t **input_data, + const cudf::bitmask_type **input_nm, + const cudf::size_type *col_sizes, + const cudf::size_type *col_offsets, + const block_info *block_infos, + const uint64_t *row_offsets, + int8_t **output_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. 
+ // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + auto block = block_infos[blockIdx.x]; + extern __shared__ int8_t shared_data[]; + uint64_t const output_start_offset = col_offsets[block.start_col] + row_offsets[block.start_row]; + uint8_t const dest_shim_offset = reinterpret_cast(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + + printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); + + // each thread is responsible for every threadcount rows of data. + // the data is copies into shared memory in the final layout. + auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const validity_offset = col_offsets[num_columns]; + for (int col=block.start_col; col<=block.end_col; ++col) { + /*if (!col_is_variable) */{ + uint64_t col_offset = 0; + cudf::size_type col_size = col_sizes[col]; + auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + for (int row=block.start_row + threadIdx.x; row(input_data[col]); + *reinterpret_cast(shmem_dest) = short_col_input[row]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(input_data[col]); + *reinterpret_cast(shmem_dest) = int_col_input[row]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + *reinterpret_cast(shmem_dest) = long_col_input[row]; + break; + } + default: { + cudf::size_type input_offset = col_size * row; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + shmem_dest[b] = input_data[col][b + input_offset]; + } + break; + } + } + + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + // we do this directly in the final location because the entire row may not + // fit in shared memory and may require many blocks to process it entirely + int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + cudf::size_type byte_bit_offset = col % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col]) { + if (bit_is_set(input_nm[col], row)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end row + + col_offset += col_sizes[col] * (block.end_row - block.start_row); + } + } // end col + + // wait for the data to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data from shared memory to final destination + // each block is potentially a slice of the table, so no assumptions + // can be made about alignments. We do know that the alignment in shared + // memory matches the final destination alignment. 
Also note that + // we are not writing to entirely contiguous destinations as each + // row in shared memory may not be an entire row of the destination. + // + auto const thread_start_offset = threadIdx.x * 8; + auto const thread_stride = gridDim.x * 8; + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const output_row_num = src_offset / shmem_row_size; + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + // the first part and last part of the row is unaligned data copy. This is copied a single byte + // at a time. + if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first part of a row, copy single bytes + auto const num_single_bytes = 8 - dest_shim_offset; + for (auto i=0; i 0 && (src_offset + 8) % shmem_row_size == 0) { + // last part of a row, copy single bytes + auto const num_single_bytes = dest_shim_offset; + for (auto i=0; i(input_ptr); + *reinterpret_cast(output_ptr) = *long_col_input; + } + } +} + +/** + * Calculate the dimensions of the kernel for fixed width only columns. + * @param [in] num_columns the number of columns being copied. + * @param [in] num_rows the number of rows being copied. + * @param [in] size_per_row the size each row takes up when padded. + * @param [out] blocks the size of the blocks for the kernel + * @param [out] threads the size of the threads for the kernel + * @return the size in bytes of shared memory needed for each block. + */ +static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, + const cudf::size_type num_rows, + const cudf::size_type size_per_row, + dim3 &blocks, + dim3 &threads) +{ + // We have found speed degrades when a thread handles more than 4 columns. + // Each block is 2 dimensional. The y dimension indicates the columns. + // We limit this to 32 threads in the y dimension so we can still + // have at least 32 threads in the x dimension (1 warp) which should + // result in better coalescing of memory operations. We also + // want to guarantee that we are processing a multiple of 32 threads + // in the x dimension because we use atomic operations at the block + // level when writing validity data out to main memory, and that would + // need to change if we split a word of validity data between blocks. + int y_block_size = (num_columns + 3) / 4; + if (y_block_size > 32) { y_block_size = 32; } + int x_possible_block_size = 1024 / y_block_size; + // 48KB is the default setting for shared memory per block according to the cuda tutorials + // If someone configures the GPU to only have 16 KB this might not work. + int max_shared_size = 48 * 1024; + int max_block_size = max_shared_size / size_per_row; + // If we don't have enough shared memory there is no point in having more threads + // per block that will just sit idle + max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; + // Make sure that the x dimension is a multiple of 32 this not only helps + // coalesce memory access it also lets us do a ballot sync for validity to write + // the data back out the warp level. If x is a multiple of 32 then each thread in the y + // dimension is associated with one or more warps, that should correspond to the validity + // words directly. 
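+  // Hypothetical worked example of the arithmetic below (illustrative values
+  // only): with 8 columns and size_per_row = 40, y_block_size = (8 + 3) / 4 = 2,
+  // x_possible_block_size = 1024 / 2 = 512 and max_block_size = 49152 / 40 = 1228,
+  // which is then clamped to 512. block_size stays 512 (already a multiple of 32)
+  // and the kernel needs 40 * 512 = 20480 bytes of shared memory per block.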
+ int block_size = (max_block_size / 32) * 32; + CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); + + int num_blocks = (num_rows + block_size - 1) / block_size; + if (num_blocks < 1) { + num_blocks = 1; + } else if (num_blocks > 10240) { + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. + num_blocks = 10240; + } + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; + threads.x = block_size; + threads.y = y_block_size; + threads.z = 1; + return size_per_row * block_size; +} + +/** + * When converting to rows it is possible that the size of the table was too big to fit + * in a single column. This creates an output column for a subset of the rows in a table + * going from start row and containing the next num_rows. Most of the parameters passed + * into this function are common between runs and should be calculated once. + */ +static std::unique_ptr fixed_width_convert_to_rows( + const cudf::size_type start_row, + const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type size_per_row, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, + const cudf::scalar &zero, + const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + + std::unique_ptr data = + cudf::make_numeric_column(cudf::data_type(cudf::type_id::INT8), + static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, + stream, + mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_from_fixed_width_columns<<>>( + start_row, + num_rows, + num_columns, + size_per_row, + column_start->data(), + column_size->data(), + input_data->data(), + input_nm->data(), + data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr); +} + +static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } + +static inline bool are_all_fixed_width(std::vector const &schema) +{ + return std::all_of( + schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); +} + +/** + * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @param [in] schema the types of columns that need to be laid out. + * @param [out] column_start the byte offset where each column starts in the row. + * @param [out] column_size the size in bytes of the data for each columns in the row. + * @return the size in bytes each row needs. 
+ */ +static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) +{ + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. + int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add it + // in + int32_t validity_bytes_needed = (schema.size() + 7) / 8; + // validity comes at the end and is byte aligned so we can pack more in. + at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) +} + +} // namespace detail + +//#define DEBUG +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough + // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + constexpr int max_window_height = 1024; + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + #if defined(DEBUG) + auto pretty_print = [](uint64_t i) { + if (i > (1 * 1024 * 1024 * 1024)) { + printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); + } else if (i > (1 * 1024 * 1024)) { + printf("%.2f MB", i / float(1 * 1024 * 1024)); + } else if (i > (1 * 1024)) { + printf("%.2f KB", float(i / 1024)); + } else { + printf("%lu Bytes", i); + } + }; + #endif + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. 
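+  // As a rough, hypothetical sizing example (the real limit is whatever the
+  // device attribute query above returned): with a 48 KB shared memory limit and
+  // the maximum window height of 1024 rows, the window-building loop below can
+  // pack about 48 bytes of row width into a single window, i.e. roughly a dozen
+  // 4-byte columns per window.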
+ + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns+1); // we add a final offset for validity data start + + size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv);} + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. + // Note that this boundary is for our book-keeping with column pointers and not anything + // that the kernel needs to worry about. We cut the output at convienient boundaries + // when assembling the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + std::vector row_batches; + + auto calculate_variable_width_row_data_size = [](int const row) { + // each level of variable-width data will add an offset/length + // uint64 of data. The first of which is inside the fixed-width + // data itself and needs to be aligned based on what is around + // that data. This is handled above with the fixed-width calculations + // for that reason. We may still need to add more of these offset/length + // combinations if the nesting is deeper than one level as these + // will be included in the variable-width data blob at the end of the + // row. + return 0; +/* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } +*/ + }; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate + // the size of each row's variable-width data as well. 
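+  // Hypothetical example of the layout computed above: for columns of types
+  // INT8, INT32 and INT64, the column loop produces column_starts = {0, 4, 8}
+  // (each column aligned to its own size) and fixed_width_size_per_row = 16,
+  // so each row below starts from a 16-byte fixed-width footprint before any
+  // variable-width data is accounted for.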
+ for (int row = 0; row < num_rows; ++row) { + row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); + if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size += row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + } + + #if defined(DEBUG) + printf("%lu batches:\n", row_batches.size()); + for (auto i = 0; i < (int)row_batches.size(); ++i) { + printf("%d: %d rows, ", i, row_batches[i].row_count); + pretty_print(row_batches[i].num_bytes); + printf("\n"); + } + #endif + + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_size = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back( + detail::block_info{start_col, + current_window_start_row, + start_col + end_col, + std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (int col = 0; col < num_columns; ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + + if (row_size_with_this_col * window_height > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + current_window_start_col = col; + } else { + row_size = row_size_with_this_col; + } + } + + auto validity_offset = detail::align_offset(column_starts.back(), 4); + column_starts.push_back(validity_offset); + + // build last set of blocks + if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + + // Get the pointers 
to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + std::vector input_data; + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + #if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); + pretty_print(shmem_limit_per_block); + printf(" shared mem("); + pretty_print(fixed_width_size_per_row); + printf("/row, %d columns, %d rows, ", num_columns, num_rows); + pretty_print(total_table_size); + printf(" total):\n"); + #endif + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i=0; i>>(num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i=0; i offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = + std::make_unique(data_type{cudf::type_id::INT8}, + row_batches[i].num_bytes, + std::move(output_data[i])); + + ret.push_back(cudf::make_lists_column(row_batches[i].row_count, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); + } + + return ret; +} + +std::vector> convert_to_rows(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + const cudf::size_type num_columns = tbl.num_columns(); + + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. 
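+    // Hypothetical example (illustrative row size only): with size_per_row = 40
+    // bytes, max_rows_per_batch starts at 2147483647 / 40 = 53,687,091 and the
+    // rounding below lowers it to 53,687,072, the next lower multiple of 32.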
+ max_rows_per_batch = (max_rows_per_batch / 32) * 32; + + cudf::size_type num_rows = tbl.num_rows(); + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, + row_count, + num_columns, + size_per_row, + dev_column_start, + dev_column_size, + dev_input_data, + dev_input_nm, + *zero, + *step, + stream, + mr)); + } + + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} + +std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + detail::copy_to_fixed_width_columns<<>>( + num_rows, + num_columns, + size_per_row, + dev_column_start->data(), + dev_column_size->data(), + dev_output_data->data(), + dev_output_nm->data(), + child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} + +std::unique_ptr convert_from_rows( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows(lcv, schema, stream, mr); + + return ret; + // } +} + +} // namespace cudf From 6e869b61c91546175792a95834c7a81f951060fd Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Jun 2021 17:53:09 +0000 Subject: [PATCH 02/80] fixing kernel launch and updating --- .../row_conversion/row_conversion.cpp | 9 +- cpp/src/row_conversion/row_conversion.cu | 105 +++++++++++++----- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index c4edee91b3c..9fa05c408e5 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -28,7 +28,7 @@ class RowConversion : public cudf::benchmark { static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, +/* auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -38,7 +38,10 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT8, cudf::type_id::UINT64}, 50, - row_count{n_rows}); + row_count{n_rows});*/ + auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows}); cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -98,7 +101,7 @@ static void BM_from_row(benchmark::State& state) (::benchmark::State & st) { BM_to_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 16, 1 << 24}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu 
index fb5dc4cb38d..994233a0700 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -347,14 +348,14 @@ struct block_info { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, +__global__ void copy_from_columns(const size_type num_rows, + const size_type num_columns, const int8_t **input_data, - const cudf::bitmask_type **input_nm, - const cudf::size_type *col_sizes, - const cudf::size_type *col_offsets, + const bitmask_type **input_nm, + const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, - const uint64_t *row_offsets, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -365,47 +366,92 @@ __global__ void copy_from_columns(const cudf::size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. + bool debug_print = false; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest - - printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); - + if (debug_print) { + printf("outputting to offset %lu\n", output_start_offset); + printf("dest shim offset is %d\n", dest_shim_offset); + printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; + if (debug_print) { + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("shmem row size %d\n", shmem_row_size); + printf("validity offset is %d\n", validity_offset); + printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + } for (int col=block.start_col; col<=block.end_col; ++col) { /*if (!col_is_variable) */{ uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + if (debug_print) { + printf("dest col offset %d\n", dest_col_offset); + } for (int row=block.start_row + threadIdx.x; row(input_data[col]); + if (debug_print) { + printf("%p <- short %d\n", shmem_dest, short_col_input[row]); + } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { const int32_t *int_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); + } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - // TODO this should just not be supported for fixed width columns, but just in case... + if (debug_print) { + printf("byte for byte copy due to size %d\n", col_size); + printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -676,6 +722,12 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + #if defined(DEBUG) + size_t free, total; + cudaMemGetInfo( &free, &total ); + printf("%lu/%lu Memory", free, total); + #endif + // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -692,7 +744,7 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. 
std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment std::vector variable_width_columns; // list of the variable width columns in the table @@ -821,7 +873,7 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, current_window_start_row, - start_col + end_col, + end_col, std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); i += window_height; @@ -889,23 +941,20 @@ std::vector> convert_to_rows2(cudf::table_view con auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - std::vector output_data; + std::vector output_buffers; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i=0; i(temp.data())); + output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it - dim3 blocks; - dim3 threads; - blocks.x = block_infos.size(); - blocks.y = 0; - blocks.z = 0; - threads.x = 1024; - threads.y = 0; - threads.z = 0; - detail::copy_from_columns<<>>(num_rows, + dim3 blocks(block_infos.size()); + dim3 threads(1024); + copy_from_columns<<>>(num_rows, num_columns, dev_input_data.data(), dev_input_nm.data(), @@ -932,14 +981,14 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_data[i])); + std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), From 2703baf52c60ec74bfecb1a495441380fbf55d39 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Jun 2021 19:25:57 +0000 Subject: [PATCH 03/80] Updates and bug fixing --- .../row_conversion/row_conversion.cpp | 76 ++- cpp/src/row_conversion/row_conversion.cu | 498 ++++++++++++------ cpp/tests/CMakeLists.txt | 4 + cpp/tests/row_conversion/row_conversion.cpp | 110 ++++ 4 files changed, 492 insertions(+), 196 deletions(-) create mode 100644 cpp/tests/row_conversion/row_conversion.cpp diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 9fa05c408e5..e1228c9df21 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,10 +25,43 @@ class RowConversion : public cudf::benchmark { }; -static void BM_to_row(benchmark::State& state) +static void BM_old_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; -/* auto const table = create_random_table({cudf::type_id::INT8, + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + 
cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto rows = cudf::convert_to_rows(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_new_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -37,11 +70,11 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 50, - row_count{n_rows});*/ - auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows}); + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,14 +85,13 @@ static void BM_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); -// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_from_row(benchmark::State& state) +/*static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -73,9 +105,6 @@ static void BM_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 4, - row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -94,18 +123,19 @@ static void BM_from_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_to_row(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +}*/ + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ @@ -116,4 +146,4 @@ TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/src/row_conversion/row_conversion.cu 
b/cpp/src/row_conversion/row_conversion.cu index 994233a0700..92ba075c316 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -44,7 +44,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } - /** * Copy a simple vector to device memory asynchronously. Be sure to read * the data on the same stream as is used to copy it. @@ -61,10 +60,9 @@ std::unique_ptr> copy_to_dev_async(const std::vector & } template -rmm::device_uvector copy_to_dev_async2( - const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +rmm::device_uvector copy_to_dev_async2(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_uvector ret(input.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( @@ -346,7 +344,7 @@ struct block_info { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, @@ -366,92 +364,119 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; - + bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("Column Info:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + uint8_t const dest_shim_offset = + reinterpret_cast(&output_data[0][output_start_offset]) & + 7; // offset for alignment shim in order to match shared memory with final dest if (debug_print) { printf("outputting to offset %lu\n", output_start_offset); printf("dest shim offset is %d\n", dest_shim_offset); printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + printf("my block is %d,%d -> %d,%d - buffer %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row, + block.buffer_num); } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
- auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const real_bytes_in_row = + col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; + auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, + 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", + block.end_col, + col_offsets[block.end_col], + block.end_col, + col_sizes[block.end_col], + block.start_col, + col_offsets[block.start_col]); printf("shmem row size %d\n", shmem_row_size); printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + printf("starting at %d,%d and going to %d, %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row); } - for (int col=block.start_col; col<=block.end_col; ++col) { - /*if (!col_is_variable) */{ - uint64_t col_offset = 0; + for (int col = block.start_col; col <= block.end_col; ++col) { + /*if (!col_is_variable) */ { + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { - printf("dest col offset %d\n", dest_col_offset); - } - for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { - printf("%p <- short %d\n", shmem_dest, short_col_input[row]); - } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); + const int32_t *int_col_input = reinterpret_cast(input_data[col]); if (debug_print) { - printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); - } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; if (debug_print) { - printf("byte for byte copy due to size %d\n", col_size); - printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... + printf("byte for byte copy due to size %d of column %d\n", col_size, col); + printf("%p <- input_data[%d] which is %d\n", + shmem_dest, + input_offset, + input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -463,11 +488,13 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + int8_t *valid_byte = + &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -479,11 +506,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * (block.end_row - block.start_row); + col_offset += col_sizes[col] * rows_in_block; } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -496,30 +523,75 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + if (debug_print) { + printf("writing final data from %d to %d at stride %d\n", + thread_start_offset, + shmem_row_size * rows_in_block, + thread_stride); + printf("rows in block %d\n", rows_in_block); + } + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - // the first part and last part of the row is unaligned data copy. This is copied a single byte - // at a time. - if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first part of a row, copy single bytes + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + + // three cases to worry about here + // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front + // 2) last 8-byte part of a large row - some bytes of pad at the end + // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front + // AND potentially pad at the rear + + // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. + // 1st case is when we're at some even multiple of shmem_row_size offset. 
+ // 2nd case is when offset + 8 is some even multiple of shmem_row_size. + // must be an 8 byte copy + + // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? + if (real_bytes_in_row + dest_shim_offset <= 8) { + // case 3, we want to copy real_bytes_in_row bytes + auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("case 3 - %d single byte final write %p -> %p\n", + num_single_bytes, + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } + output_ptr[i] = input_ptr[i + dest_shim_offset]; + } + } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first byte with leading pad auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i=0; i %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } - } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { - // last part of a row, copy single bytes - auto const num_single_bytes = dest_shim_offset; - for (auto i=0; i 0) { + // last bytes of a row + auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("single trailing byte final write %p -> %p\n", + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } } else { // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); + const int64_t *long_col_input = reinterpret_cast(input_ptr); + if (debug_print) { + printf( + "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); + } *reinterpret_cast(output_ptr) = *long_col_input; } } @@ -696,13 +768,14 @@ std::vector> convert_to_rows2(cudf::table_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough - // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. constexpr int max_window_height = 1024; - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); - #if defined(DEBUG) +#if defined(DEBUG) auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); @@ -714,7 +787,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu Bytes", i); } }; - #endif +#endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -722,11 +795,11 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - #if defined(DEBUG) +#if defined(DEBUG) size_t free, total; - cudaMemGetInfo( &free, &total ); - printf("%lu/%lu Memory", free, total); - #endif + cudaMemGetInfo(&free, &total); + printf("%lu/%lu Memory\n", free, total); +#endif // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -743,45 +816,46 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns+1); // we add a final offset for validity data start + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); + auto cv = tbl.column(col); + auto col_type = cv.type(); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - if (nested_type) { variable_width_columns.push_back(cv);} + if (nested_type) { variable_width_columns.push_back(cv); } // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; } - + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. - // Note that this boundary is for our book-keeping with column pointers and not anything - // that the kernel needs to worry about. We cut the output at convienient boundaries - // when assembling the outgoing data stream. + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. 
struct row_batch { size_type num_bytes; size_type row_count; @@ -798,71 +872,90 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. return 0; -/* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } -*/ + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + auto calculate_validity_size = [](int const num_cols) { + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + return (num_cols + 7) / 8; + }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate - // the size of each row's variable-width data as well. + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. for (int row = 0; row < num_rows; ++row) { - row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); - if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += calculate_validity_size(num_columns); + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; + row_batches.push_back( + row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size += row_sizes[row]; + row_batch_size = aligned_row_batch_size + row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); 
} - #if defined(DEBUG) +#if defined(DEBUG) + printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); printf("\n"); } - #endif +#endif std::vector block_infos; // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access - int current_window_size = 0; + int current_window_width = 0; int current_window_start_col = 0; // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + auto build_blocks = [&block_infos, &row_batches, num_rows]( + int const start_col, int const end_col, int const desired_window_height) { int current_window_start_row = 0; int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; while (i < num_rows) { if (rows_left_in_batch == 0) { current_window_row_batch++; @@ -872,9 +965,10 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, num_rows - 1), + current_window_row_batch}); i += window_height; current_window_start_row += window_height; @@ -882,7 +976,17 @@ std::vector> convert_to_rows2(cudf::table_view con } }; - int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + int const window_height = + std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); +#if defined(DEBUG) + printf( + "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + max_window_height, + num_rows, + row_batches[0].row_count, + window_height); +#endif int row_size = 0; @@ -891,32 +995,74 @@ std::vector> convert_to_rows2(cudf::table_view con auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; if (row_size_with_this_col * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_this_col * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_this_col, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col - 1, window_height); - row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row current_window_start_col = col; + current_window_width = 0; } else { row_size = row_size_with_this_col; + current_window_width++; } } - auto validity_offset = detail::align_offset(column_starts.back(), 4); +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif + auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); column_starts.push_back(validity_offset); - + // build last set of blocks - if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); + } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while + // calculating other things std::vector input_data; std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); + column_view cv = tbl.column(column_number); auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; if (!nested_type) { input_data.emplace_back(cv.data()); @@ -924,81 +1070,87 @@ std::vector> convert_to_rows2(cudf::table_view con } } - #if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); +#if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", + block_infos.size(), + block_infos[0].end_col - block_infos[0].start_col + 1, + block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); - #endif +#endif auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); auto 
dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); std::vector output_buffers; std::vector output_data; output_data.reserve(row_batches.size()); - for (uint i=0; i(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(1024); - copy_from_columns<<>>(num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + copy_from_columns<<>>( + num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = - std::make_unique(data_type{cudf::type_id::INT8}, - row_batches[i].num_bytes, - std::move(output_buffers[i])); + auto data = std::make_unique( + data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 088b0b747fb..2da28425c9e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -330,6 +330,10 @@ ConfigureTest(RESHAPE_TEST reshape/interleave_columns_tests.cpp reshape/tile_tests.cpp) +################################################################################################### +# - row conversion test ---------------------------------------------------------------------------------- +ConfigureTest(ROW_CONVERSION_TEST row_conversion/row_conversion.cpp) + ################################################################################################### # - traits test ----------------------------------------------------------------------------------- 
ConfigureTest(TRAITS_TEST types/traits_test.cpp) diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp new file mode 100644 index 00000000000..c02f83ad1d5 --- /dev/null +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +struct ColumnToRowTests : public cudf::test::BaseFixture { +}; + +TEST_F(ColumnToRowTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Simple) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Tall) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Wide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, SingleByteWide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} From 63a663697f9af3756e6b907d5a1595f3cdd8127a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 21 Jun 2021 18:17:45 +0000 Subject: [PATCH 04/80] Updating windows to be generated in a square way so we can 
have more data to write out as 8-byte writes from shared memory. Shuffled some of the copy to GPU code up so it can start the copy sooner and hopefully won't force stalls. Some bug fixes. --- .../row_conversion/row_conversion.cpp | 15 ++- cpp/src/row_conversion/row_conversion.cu | 96 +++++++++++-------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index e1228c9df21..d6b195433cf 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -125,7 +125,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); }*/ -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ @@ -134,8 +134,17 @@ static void BM_new_to_row(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 92ba075c316..3f221e2f716 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -364,7 +364,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
- bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,6 +383,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } + //else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -416,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); - printf("shmem row size %d\n", shmem_row_size); + printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); printf("validity offset is %d\n", validity_offset); printf("starting at %d,%d and going to %d, %d\n", block.start_col, @@ -524,6 +525,8 @@ __global__ void copy_from_columns(const size_type num_rows, // auto const thread_start_offset = threadIdx.x * 8; auto const thread_stride = gridDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; + if (debug_print) { printf("writing final data from %d to %d at stride %d\n", thread_start_offset, @@ -531,7 +534,7 @@ __global__ void copy_from_columns(const size_type num_rows, thread_stride); printf("rows in block %d\n", rows_in_block); } - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + for (auto src_offset = thread_start_offset; src_offset < end_offset; src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; auto const row_offset = row_offsets[block.start_row + output_row_num]; @@ -771,7 +774,6 @@ std::vector> convert_to_rows2(cudf::table_view con // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. - constexpr int max_window_height = 1024; const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -816,6 +818,25 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
+ // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column @@ -847,6 +868,9 @@ std::vector> convert_to_rows2(cudf::table_view con fixed_width_size_per_row += col_size; } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient @@ -901,17 +925,18 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. + auto validity_size = calculate_validity_size(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned row_sizes[row] = fixed_width_size_per_row; // validity is byte aligned - row_sizes[row] += calculate_validity_size(num_columns); + row_sizes[row] += validity_size; // variable width data is 8-byte aligned row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); @@ -932,7 +957,9 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); } -#if defined(DEBUG) + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -942,6 +969,16 @@ std::vector> convert_to_rows2(cudf::table_view con } #endif + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + std::vector block_infos; // block infos are organized with the windows going "down" the 
columns @@ -976,8 +1013,13 @@ std::vector> convert_to_rows2(cudf::table_view con } }; + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized + // access, but since other blocks will read/write the edges this may not turn out to be overly important. + // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. + // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are + // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. int const window_height = - std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); #if defined(DEBUG) printf( "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " @@ -998,20 +1040,21 @@ std::vector> convert_to_rows2(cudf::table_view con std::size_t alignment_needed = col_size; // They are the same for fixed width types auto row_size_aligned = detail::align_offset(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_this_col * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { #if defined(DEBUG) printf( "Window size %d too large at column %d, bumping back to build windows of size %d(cols " "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", - row_size_with_this_col * window_height, + row_size_with_end_pad * window_height, col, row_size * window_height, current_window_start_col, col - 1, window_height, - row_size_with_this_col, + row_size_with_end_pad, row_size, row_size_aligned, shmem_limit_per_block); @@ -1055,20 +1098,6 @@ std::vector> convert_to_rows2(cudf::table_view con build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while - // calculating other things - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1083,26 +1112,11 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - 
output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); + dim3 threads(std::min(1024, shmem_limit_per_block / 8)); #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); From b444279cbe7f41d858eb642e9c74c8bd8e9c8d69 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jul 2021 01:52:36 +0000 Subject: [PATCH 05/80] Adding row to column conversion code. Performance falls off a cliff, but starts out reasonably. I haven't looked at this in nsight yet. --- .../row_conversion/row_conversion.cpp | 74 +- cpp/include/cudf/row_conversion.hpp | 12 + cpp/src/row_conversion/row_conversion.cu | 759 +++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 106 +++ 4 files changed, 748 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index d6b195433cf..7c1f52c5cd6 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -91,7 +91,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -/*static void BM_from_row(benchmark::State& state) +static void BM_old_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -123,36 +123,62 @@ static void BM_new_to_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -}*/ - -#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +} + +static void BM_new_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows2(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} -#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, 
name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_from_row(st); } \ + (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 22}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index f5e2225ad19..282ffa4b0cb 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,4 +48,16 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows2( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 3f221e2f716..c0e78a03576 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -30,6 +30,7 @@ #include #include +#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" @@ -332,6 +333,20 @@ struct block_info { int buffer_num; }; +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. 
+struct row_batch { + size_type num_bytes; + size_type row_count; +}; + /** * @brief copy data from cudf columns into x format, which is row-based * @@ -364,7 +379,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,7 +398,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } - //else { return; } + // else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -403,7 +418,7 @@ __global__ void copy_from_columns(const size_type num_rows, block.buffer_num); } // each thread is responsible for every threadcount rows of data. - // the data is copies into shared memory in the final layout. + // the data is copied into shared memory in the final layout. auto const real_bytes_in_row = col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, @@ -432,7 +447,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += gridDim.x) { + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { if (debug_print) { printf("shmem row %d(%d) at offset %d(%d)\n", row - block.start_row, @@ -524,8 +539,8 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. 
// auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; + auto const thread_stride = blockDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; if (debug_print) { printf("writing final data from %d to %d at stride %d\n", @@ -559,9 +574,10 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; for (auto i = 0; i < num_single_bytes; ++i) { if (debug_print) { - printf("case 3 - %d single byte final write %p -> %p\n", + printf("case 3 - %d single byte final write %p(%d) -> %p\n", num_single_bytes, &input_ptr[i + dest_shim_offset], + input_ptr[i + dest_shim_offset], &output_ptr[i]); } output_ptr[i] = input_ptr[i + dest_shim_offset]; @@ -600,6 +616,237 @@ __global__ void copy_from_columns(const size_type num_rows, } } +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type *offsets, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const size_type *col_sizes, + const size_type *col_offsets, + const block_info *block_infos, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i = 0; i < num_columns; ++i) { + printf("col %d is at %p with size %d and offset %d\n", + i, + output_data[i], + col_sizes[i], + col_offsets[i]); + } + printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + /* printf("Row Offsets:\n"); + for (int i=0; i(&input_data[offsets[absolute_row] + offset_in_row]); + if (debug_print) { + printf("which will be address %p\n", long_col_input); + printf("%p <- long %lu\n", shmem_dest, *long_col_input); } + *reinterpret_cast(shmem_dest) = *long_col_input; + } + + __syncthreads(); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
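// Illustration of the flattened index with hypothetical sizes: if cols_in_block
// is 3, then index 0 -> (row 0, col 0), index 1 -> (row 0, col 1),
// index 2 -> (row 0, col 2), index 3 -> (row 1, col 0), and so on, since
// relative_col = index % cols_in_block and relative_row = index / cols_in_block.
// Consecutive threads therefore handle adjacent columns of the same shared-memory
// row until the column index wraps.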
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; + auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + + shared_memory_row_offset + shared_memory_starting_pad; + auto const column_size = col_sizes[absolute_col]; + + int8_t *shmem_src = &shared_data[shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, + shmem_src, dst) ; + } + switch (column_size) { + case 1: { + if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } + *dst = *shmem_src; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } + *reinterpret_cast(dst) = *short_col_input; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } + *reinterpret_cast(dst) = *int_col_input; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } + *reinterpret_cast(dst) = *long_col_input; + break; + } + default: { + if (debug_print) { + printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + } + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } + break; + } + } + } + + __syncthreads(); + + // now handle validity. Each thread is responsible for 32 rows in a single column. + // to prevent indexing issues with a large number of threads, this is compressed + // to a single loop like above. TODO: investigate using shared memory here + auto const validity_batches_per_col = (num_rows + 31) / 32; + auto const validity_batches_total = validity_batches_per_col * num_columns; + if (debug_print) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + } + for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { + // what column is this? 
+ auto const col = index / validity_batches_per_col; + auto const batch = index % validity_batches_per_col; + auto const starting_row = batch * 32; + auto const validity_offset = col_offsets[num_columns] + col / 8; + + if (debug_print) { + printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + } + + int32_t dst_validity = 0; + for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { + int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + + if (debug_print) { + printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + } + + auto const val_byte = *validity_ptr; + auto const src_shift = col % 8; + auto const dst_shift = row % 32; + auto const src_bit_mask = 1 << src_shift; + if (debug_print) { + printf("src bit mask is 0x%x\n", src_bit_mask); + printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); + printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); + } +// auto const dst_bit_mask = 1 << dst_shift; + dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); + if (debug_print) { + printf("validity is now 0x%x\n", dst_validity); + } + } + + + int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); + if (debug_print) { + printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); + printf("validity to write is %d\n", dst_validity); + printf("validity write %p <- %d\n", validity_ptr, dst_validity); + } + *validity_ptr = dst_validity; + } +} + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -764,21 +1011,165 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -} // namespace detail +template +static size_type compute_column_information( + iterator begin, + iterator end, + std::vector &column_starts, + std::vector &column_sizes)//, + //std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + +// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 
8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} //#define DEBUG -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + +static std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. 
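// Worked example, assuming a 48 KiB shared-memory budget and an 8-byte first
// column: size_type(sqrt(49152)) is 221 bytes per side, giving a starting window
// height of 221 / 8 = 27 rows, which the std::min below further clamps to the
// table's row count and to the first row batch's row count.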
+ int const window_height = std::min( + std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), + row_batches[0].row_count); +#if defined(DEBUG) + printf( + "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + total_number_of_rows, + row_batches[0].row_count, + window_height); +#endif + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_end_pad * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_end_pad, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + } + + return block_infos; +} +} // namespace detail #if defined(DEBUG) - auto pretty_print = [](uint64_t i) { + void pretty_print(uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -788,9 +1179,19 @@ std::vector> convert_to_rows2(cudf::table_view con } else { printf("%lu Bytes", i); } - }; + } #endif +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. 
+ const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -834,8 +1235,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -848,43 +1249,48 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, + iter + num_columns, + column_starts, + column_sizes);//, +// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ - if (nested_type) { variable_width_columns.push_back(cv); } +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - - // When building the columns to return, we have to be mindful of the offset limit in cudf. 
- // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. - struct row_batch { - size_type num_bytes; - size_type row_count; - }; - std::vector row_batches; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -936,10 +1342,11 @@ std::vector> convert_to_rows2(cudf::table_view con row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( - row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); row_batch_size = 0; row_batch_rows = row_batch_rows & 31; row_offset = 0; @@ -954,12 +1361,12 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - #if defined(DEBUG) +#if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -979,125 +1386,8 @@ std::vector> convert_to_rows2(cudf::table_view con } auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back( - detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has 
lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized - // access, but since other blocks will read/write the edges this may not turn out to be overly important. - // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. - // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are - // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. - int const window_height = - std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); -#if defined(DEBUG) - printf( - "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - max_window_height, - num_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (int col = 0; col < num_columns; ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); - column_starts.push_back(validity_offset); - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); - } - + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1116,7 +1406,11 @@ std::vector> convert_to_rows2(cudf::table_view con // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 
threads(std::min(1024, shmem_limit_per_block / 8)); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); + #endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); @@ -1206,11 +1500,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); + zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid(true, stream); + step->set_valid_async(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1238,6 +1532,97 @@ std::vector> convert_to_rows(cudf::table_view cons } } +std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + + size_type validity_size = (num_columns + 7) / 8; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + + dim3 blocks(block_infos.size()); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); + #endif +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + detail::copy_to_columns<<>>( + num_rows, + num_columns, + input.offsets().data(), + dev_output_data.data(), + dev_output_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + child.data()); + + return std::make_unique(std::move(output_columns)); +} + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1318,4 +1703,20 @@ std::unique_ptr convert_from_rows( // } } +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows2(lcv, schema, stream, mr); + + return ret; + // } +} + } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index c02f83ad1d5..818d7a89ddb 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,9 +21,13 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" +#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; +struct RowToColumnTests : public cudf::test::BaseFixture { +}; TEST_F(ColumnToRowTests, Single) { @@ -108,3 +112,105 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i a({-1, 0, 1}); + cudf::table_view 
in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i Date: Thu, 8 Jul 2021 20:45:18 +0000 Subject: [PATCH 06/80] updating to use make_device_uvector_async and bitmask functions per review comments --- cpp/src/row_conversion/row_conversion.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c0e78a03576..c73e967cf0f 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -45,32 +47,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync( - ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - -template -rmm::device_uvector copy_to_dev_async2(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - rmm::device_uvector ret(input.size(), stream, mr); - CUDA_TRY(cudaMemcpyAsync( - ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -180,8 +156,8 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } @@ -278,8 +254,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -505,8 +481,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; - cudf::size_type byte_bit_offset = col % 8; + &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; + cudf::size_type byte_bit_offset = intra_word_index(col); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -648,7 +624,7 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
- bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -806,7 +782,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const col = index / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + col / 8; + auto const validity_offset = col_offsets[num_columns] + word_index(col); if (debug_print) { printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); @@ -821,7 +797,7 @@ __global__ void copy_to_columns(const size_type num_rows, } auto const val_byte = *validity_ptr; - auto const src_shift = col % 8; + auto const src_shift = intra_word_index(col); auto const dst_shift = row % 32; auto const src_bit_mask = 1 << src_shift; if (debug_print) { @@ -920,10 +896,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -954,10 +930,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -1004,7 +980,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = word_index(schema.size() + 7); // validity comes at the end and is byte aligned so we can pack more in. 
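// Aside on the word_index()/intra_word_index() substitutions in this commit:
// those helpers operate on 32-bit bitmask words (bit / 32 and bit % 32), while
// the row format stores validity byte-aligned, so the byte arithmetic here is
// (n + 7) / 8 and n % 8. The next commit in this series reverts these calls for
// exactly that reason.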
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1235,8 +1211,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -1287,8 +1263,8 @@ std::vector> convert_to_rows2(cudf::table_view con #endif - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); std::vector row_batches; @@ -1322,16 +1298,9 @@ std::vector> convert_to_rows2(cudf::table_view con size_type row_batch_rows = 0; uint64_t row_offset = 0; - auto calculate_validity_size = [](int const num_cols) { - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - return (num_cols + 7) / 8; - }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = calculate_validity_size(num_columns); + auto validity_size = num_bitmask_words(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1364,7 +1333,7 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); @@ -1384,7 +1353,7 @@ std::vector> convert_to_rows2(cudf::table_view con output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); @@ -1402,7 +1371,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); @@ -1443,7 +1412,7 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); auto offsets = std::make_unique( data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); @@ -1477,8 +1446,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = 
detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1495,8 +1464,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -1561,7 +1530,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = (num_columns + 7) / 8; + size_type validity_size = num_bitmask_words(num_columns); size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1569,8 +1538,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // this is probably fine CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // build the row_batches from the passed in list column std::vector row_batches; @@ -1590,13 +1559,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(block_infos.size()); #if defined(DEBUG) || 1 @@ -1647,8 +1616,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1663,8 +1632,8 @@ 
std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1675,10 +1644,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); From f8bc01fa175a44fed79645f4c39c6e0944acfb6e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jul 2021 07:18:49 +0000 Subject: [PATCH 07/80] updating conversion code. Found out bit operations are on 32-bit values, so they can't be used since row data has byte-aligned validity. Performance improvements on the row to column side. --- cpp/src/row_conversion/row_conversion.cu | 106 ++++++++++++----------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c73e967cf0f..0879a1c50a5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -37,6 +37,8 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) + using cudf::detail::make_device_uvector_async; namespace cudf { @@ -156,11 +158,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -254,8 +256,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -481,8 +483,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; - cudf::size_type 
byte_bit_offset = intra_word_index(col); + &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -597,6 +599,7 @@ __global__ void copy_from_columns(const size_type num_rows, * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block * @param offsets * @param output_data * @param output_nm @@ -608,6 +611,7 @@ __global__ void copy_from_columns(const size_type num_rows, */ __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *offsets, int8_t **output_data, cudf::bitmask_type **output_nm, @@ -624,18 +628,10 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - output_data[i], - col_sizes[i], - col_offsets[i]); - } printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i blockDim.x) { + break; + } + auto block = block_infos[this_block_index]; auto const rows_in_block = block.end_row - block.start_row + 1; auto const cols_in_block = block.end_col - block.start_col + 1; extern __shared__ int8_t shared_data[]; @@ -767,61 +769,58 @@ __global__ void copy_to_columns(const size_type num_rows, } } - __syncthreads(); - - // now handle validity. Each thread is responsible for 32 rows in a single column. + // now handle validity. Each thread is responsible for 32 rows in 8 columns. // to prevent indexing issues with a large number of threads, this is compressed // to a single loop like above. TODO: investigate using shared memory here auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = validity_batches_per_col * num_columns; - if (debug_print) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); + if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); } - for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { - // what column is this? 
- auto const col = index / validity_batches_per_col; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { + auto const start_col = (index * 8) / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + word_index(col); + auto const validity_offset = col_offsets[num_columns] + (start_col / 8); if (debug_print) { - printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); } - int32_t dst_validity = 0; + // one for each column + int32_t dst_validity[8] = {0}; for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; if (debug_print) { - printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); } auto const val_byte = *validity_ptr; - auto const src_shift = intra_word_index(col); - auto const dst_shift = row % 32; - auto const src_bit_mask = 1 << src_shift; - if (debug_print) { - printf("src bit mask is 0x%x\n", src_bit_mask); - printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); - printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); - } -// auto const dst_bit_mask = 1 << dst_shift; - dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - if (debug_print) { - printf("validity is now 0x%x\n", dst_validity); + + for (int i=0; i> src_shift); + } + // auto const dst_bit_mask = 1 << dst_shift; + dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); - if (debug_print) { - printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); - printf("validity to write is %d\n", dst_validity); - printf("validity write %p <- %d\n", validity_ptr, dst_validity); + for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); + if (debug_print) { + printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); + } + *validity_ptr = dst_validity[i]; } - *validity_ptr = dst_validity; } } +} /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -980,7 +979,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = word_index(schema.size() + 7); + int32_t validity_bytes_needed = (schema.size() + 7) / 8; // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1300,7 +1299,7 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns); + auto validity_size = num_bitmask_words(num_columns) * 4; for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1521,6 +1520,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; + std::vector column_starts; std::vector column_sizes; @@ -1530,7 +1531,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = num_bitmask_words(num_columns); + size_type validity_size = num_bitmask_words(num_columns) * 4; size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1567,7 +1568,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(block_infos.size()); + dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); #if defined(DEBUG) || 1 dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); #else @@ -1581,6 +1582,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i detail::copy_to_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_output_nm.data(), From 636b235750668dc06d63512574b6b8cee2d263e6 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 7 Jun 2021 08:14:52 +0000 Subject: [PATCH 08/80] working on row and column conversions --- cpp/benchmarks/CMakeLists.txt | 27 +- .../row_conversion/row_conversion.cpp | 106 +- cpp/include/cudf/row_conversion.hpp | 12 - cpp/src/row_conversion/row_conversion.cu | 1183 +++++------------ 4 files changed, 320 insertions(+), 1008 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7d353c37df7..5cc48436d01 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -29,7 +29,6 @@ target_link_libraries(cudf_datagen GTest::gmock_main GTest::gtest_main benchmark::benchmark - nvbench::nvbench Threads::Threads cudf) @@ -51,19 +50,11 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) function(ConfigureBench CMAKE_BENCH_NAME) add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) endfunction() -function(ConfigureNVBench CMAKE_BENCH_NAME) - add_executable(${CMAKE_BENCH_NAME} ${ARGN}) - set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_BENCH_NAME} - PRIVATE cudf_benchmark_common cudf_datagen nvbench::main) -endfunction() - 
################################################################################################### # - column benchmarks ----------------------------------------------------------------------------- ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) @@ -76,10 +67,6 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) # - scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) -################################################################################################### -# - lists scatter benchmark ----------------------------------------------------------------------- -ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) - ################################################################################################### # - contiguous_split benchmark ------------------------------------------------------------------- ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) @@ -102,8 +89,7 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) -ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) +ConfigureBench(JOIN_BENCH join/join_benchmark.cu) ################################################################################################### # - iterator benchmark ---------------------------------------------------------------------------- @@ -205,7 +191,6 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) # - binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cpp - binaryop/compiled_binaryop_benchmark.cpp binaryop/jit_binaryop_benchmark.cpp) ################################################################################################### @@ -233,7 +218,6 @@ ConfigureBench(STRINGS_BENCH string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp - string/repeat_strings_benchmark.cpp string/replace_benchmark.cpp string/replace_re_benchmark.cpp string/split_benchmark.cpp @@ -247,10 +231,5 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) ################################################################################################### -# - io benchmark --------------------------------------------------------------------- -ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK - io/text/multibyte_split_benchmark.cpp) - -################################################################################################### -# - row conversion benchmark --------------------------------------------------------- +# - row conversion benchmark ---------------------------------------------------------------------------- ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 7c1f52c5cd6..c4edee91b3c 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,7 +25,7 @@ class RowConversion : public cudf::benchmark { }; -static void BM_old_to_row(benchmark::State& state) +static void 
BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -37,44 +37,8 @@ static void BM_old_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 212, + 50, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto rows = cudf::convert_to_rows(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -85,13 +49,14 @@ static void BM_new_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); +// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_old_from_row(benchmark::State& state) +static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -105,6 +70,9 @@ static void BM_old_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 4, + row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -125,60 +93,24 @@ static void BM_old_from_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_new_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows(table->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows2(rows, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - 
->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_to_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 16, 1 << 24}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ + (::benchmark::State & st) { BM_from_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ + ->Ranges({{1 << 6, 1 << 22}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 282ffa4b0cb..f5e2225ad19 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,16 +48,4 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows2( - cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0879a1c50a5..fb5dc4cb38d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,14 +14,12 @@ * limitations under the License. */ -#include #include #include #include #include #include -#include #include #include #include @@ -31,15 +29,11 @@ #include #include -#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" -#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) - -using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -49,6 +43,34 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } + +/** + * Copy a simple vector to device memory asynchronously. Be sure to read + * the data on the same stream as is used to copy it. 
+ */ +template +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync( + ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +template +rmm::device_uvector copy_to_dev_async2( + const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + rmm::device_uvector ret(input.size(), stream, mr); + CUDA_TRY(cudaMemcpyAsync( + ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -162,7 +184,7 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -311,20 +333,6 @@ struct block_info { int buffer_num; }; -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. -struct row_batch { - size_type num_bytes; - size_type row_count; -}; - /** * @brief copy data from cudf columns into x format, which is row-based * @@ -337,16 +345,16 @@ struct row_batch { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ -__global__ void copy_from_columns(const size_type num_rows, - const size_type num_columns, +__global__ void copy_from_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, const int8_t **input_data, - const bitmask_type **input_nm, - const size_type *col_sizes, - const size_type *col_offsets, + const cudf::bitmask_type **input_nm, + const cudf::size_type *col_sizes, + const cudf::size_type *col_offsets, const block_info *block_infos, - const size_type *row_offsets, + const uint64_t *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -357,119 +365,46 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
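The hunk above stores one 32-bit null-mask word per 32 rows: every thread contributes a predicate bit through __ballot_sync, and only the lane whose row index is a multiple of 32 writes the assembled word at word_index(row_index). A host-side emulation of that packing, assuming word_index(row) == row / 32 as in cudf's bitmask convention (a sketch, not the kernel):

#include <cassert>
#include <cstdint>
#include <vector>

// Sketch: pack one validity bit per row into 32-bit words, mirroring what the
// warp ballot in copy_to_fixed_width_columns produces on the device.
std::vector<uint32_t> pack_validity(std::vector<bool> const& valid)
{
  std::vector<uint32_t> words((valid.size() + 31) / 32, 0);
  for (std::size_t row = 0; row < valid.size(); ++row) {
    if (valid[row]) { words[row / 32] |= 1u << (row % 32); }  // word_index(row) == row / 32
  }
  return words;
}

int main()
{
  std::vector<bool> valid(40, false);
  valid[0]  = true;
  valid[33] = true;
  auto const words = pack_validity(valid);
  assert(words.size() == 2);
  assert(words[0] == 1u);         // row 0 -> bit 0 of the first word
  assert(words[1] == (1u << 1));  // row 33 -> bit 1 of the second word
  return 0;
}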
- bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - input_data[i], - col_sizes[i], - col_offsets[i]); - } - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & - 7; // offset for alignment shim in order to match shared memory with final dest - if (debug_print) { - printf("outputting to offset %lu\n", output_start_offset); - printf("dest shim offset is %d\n", dest_shim_offset); - printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); - printf("my block is %d,%d -> %d,%d - buffer %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row, - block.buffer_num); - } + uint8_t const dest_shim_offset = reinterpret_cast(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + + printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); + // each thread is responsible for every threadcount rows of data. - // the data is copied into shared memory in the final layout. - auto const real_bytes_in_row = - col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; - auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, - 8); // 8 byte alignment required for shared memory rows + // the data is copies into shared memory in the final layout. 
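dest_shim_offset above is the output row's address modulo 8, so rows staged in shared memory can reproduce the destination's (possibly unaligned) placement and still be flushed with 8-byte stores. The bit trick in isolation, with made-up addresses (a sketch only):

#include <cassert>
#include <cstdint>

// Sketch: misalignment of an address within an 8-byte word; for unsigned
// values, addr & 7 is equivalent to addr % 8.
constexpr uint8_t shim_offset(uintptr_t addr) { return static_cast<uint8_t>(addr & 7); }

int main()
{
  assert(shim_offset(0x1000) == 0);  // already 8-byte aligned
  assert(shim_offset(0x1003) == 3);  // three bytes past an 8-byte boundary
  assert(shim_offset(0x100F) == 7);
  return 0;
}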
+ auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; - if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", - block.end_col, - col_offsets[block.end_col], - block.end_col, - col_sizes[block.end_col], - block.start_col, - col_offsets[block.start_col]); - printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); - printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row); - } - for (int col = block.start_col; col <= block.end_col; ++col) { - /*if (!col_is_variable) */ { - uint64_t col_offset = 0; + for (int col=block.start_col; col<=block.end_col; ++col) { + /*if (!col_is_variable) */{ + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = - col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - if (debug_print) { - printf("shmem row %d(%d) at offset %d(%d)\n", - row - block.start_row, - row, - (row - block.start_row) * shmem_row_size, - row * shmem_row_size); - } - int8_t *shmem_dest = - &shared_data[dest_col_offset + shmem_row_size * (row - block.start_row)]; + auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); - } + const int32_t *int_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", col_size, col); - printf("%p <- input_data[%d] which is %d\n", - shmem_dest, - input_offset, - input_data[col][input_offset]); - } // TODO this should just not be supported for fixed width columns, but just in case... 
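The switch above specializes the shared-memory store on the element width (2, 4, or 8 bytes) and falls back to a byte loop for anything else. A host-side sketch of the same row-packing idea using the column size/offset tables; memcpy stands in for the width-specialized loads, and all names and values are illustrative:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch: copy one row's fixed-width values from columnar buffers into a packed row.
void pack_row(std::vector<std::vector<int8_t>> const& col_data,  // raw bytes per column
              std::vector<int> const& col_sizes,
              std::vector<int> const& col_offsets,
              int row,
              int8_t* row_out)
{
  for (std::size_t col = 0; col < col_sizes.size(); ++col) {
    auto const size = col_sizes[col];
    std::memcpy(row_out + col_offsets[col], col_data[col].data() + row * size, size);
  }
}

int main()
{
  // Two columns: an int32 column and an int16 column, packing row 1.
  int32_t const ints[]   = {7, 42};
  int16_t const shorts[] = {3, 9};
  std::vector<std::vector<int8_t>> col_data(2);
  col_data[0].assign(reinterpret_cast<int8_t const*>(ints),
                     reinterpret_cast<int8_t const*>(ints) + sizeof(ints));
  col_data[1].assign(reinterpret_cast<int8_t const*>(shorts),
                     reinterpret_cast<int8_t const*>(shorts) + sizeof(shorts));

  int8_t row[6] = {};
  pack_row(col_data, {4, 2}, {0, 4}, /*row=*/1, row);

  int32_t i = 0;
  int16_t s = 0;
  std::memcpy(&i, row, 4);
  std::memcpy(&s, row + 4, 2);
  assert(i == 42 && s == 9);
  return 0;
}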
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; @@ -482,13 +417,11 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -500,11 +433,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * rows_in_block; + col_offset += col_sizes[col] * (block.end_row - block.start_row); } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -517,311 +450,35 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = blockDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; - - if (debug_print) { - printf("writing final data from %d to %d at stride %d\n", - thread_start_offset, - shmem_row_size * rows_in_block, - thread_stride); - printf("rows in block %d\n", rows_in_block); - } - for (auto src_offset = thread_start_offset; src_offset < end_offset; - src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - - // three cases to worry about here - // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front - // 2) last 8-byte part of a large row - some bytes of pad at the end - // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front - // AND potentially pad at the rear - - // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. - // 1st case is when we're at some even multiple of shmem_row_size offset. - // 2nd case is when offset + 8 is some even multiple of shmem_row_size. - // must be an 8 byte copy - - // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? 
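Because atomicOr needs a 4-byte-aligned word, the validity write above rebases the target byte onto its enclosing int32 and widens the bit offset by 8 bits per byte of fixup. The index arithmetic in isolation, assuming the validity area itself starts 4-byte aligned and a little-endian byte order as on the GPU (a sketch):

#include <cassert>

int main()
{
  // The validity bit for column `col` lives in byte (col / 8), bit (col % 8).
  int const col             = 13;
  int const byte_index      = col / 8;  // byte 1
  int const byte_bit_offset = col % 8;  // bit 5 of that byte

  // Rebase onto the enclosing 4-byte word so an atomic OR can be used.
  int const fixup_bytes    = byte_index % 4;                     // bytes between the word start and our byte
  int const word_index     = byte_index / 4;                     // which int32 receives the atomic OR
  int const int_bit_offset = byte_bit_offset + fixup_bytes * 8;  // bit within that int32

  assert(word_index == 0);
  assert(int_bit_offset == 13);  // same absolute bit as byte 1, bit 5 in a little-endian layout
  return 0;
}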
- if (real_bytes_in_row + dest_shim_offset <= 8) { - // case 3, we want to copy real_bytes_in_row bytes - auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("case 3 - %d single byte final write %p(%d) -> %p\n", - num_single_bytes, - &input_ptr[i + dest_shim_offset], - input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first byte with leading pad + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + // the first part and last part of the row is unaligned data copy. This is copied a single byte + // at a time. + if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first part of a row, copy single bytes auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf( - "single byte final write %p -> %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); - } + for (auto i=0; i 0) { - // last bytes of a row - auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("single trailing byte final write %p -> %p\n", - &input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } + } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { + // last part of a row, copy single bytes + auto const num_single_bytes = dest_shim_offset; + for (auto i=0; i(input_ptr); - if (debug_print) { - printf( - "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); - } + const int64_t *long_col_input = reinterpret_cast(input_ptr); *reinterpret_cast(output_ptr) = *long_col_input; } } } -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ -__global__ void copy_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type *offsets, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const int8_t *input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
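In the write-back loop above, only the first and last 8-byte chunk of a staged row can be partially meaningful: the head holds 8 - dest_shim_offset real bytes and the tail holds whatever does not fill a final 8-byte store. A quick sketch of those counts with made-up sizes (not the kernel code):

#include <cassert>

int main()
{
  int const dest_shim_offset  = 3;   // destination row starts 3 bytes past an 8-byte boundary
  int const real_bytes_in_row = 21;  // payload bytes for this row

  int const head_bytes  = 8 - dest_shim_offset;                        // copied one byte at a time
  int const tail_bytes  = (real_bytes_in_row + dest_shim_offset) % 8;  // leftover after the last full store
  int const full_chunks = (real_bytes_in_row - head_bytes - tail_bytes) / 8;  // moved as 8-byte stores

  assert(head_bytes == 5);
  assert(tail_bytes == 0);   // 21 + 3 is a multiple of 8, so no trailing singles
  assert(full_chunks == 2);  // 5 + 16 + 0 == 21
  return 0;
}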
- - constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i blockDim.x) { - break; - } - auto block = block_infos[this_block_index]; - auto const rows_in_block = block.end_row - block.start_row + 1; - auto const cols_in_block = block.end_col - block.start_col + 1; - extern __shared__ int8_t shared_data[]; - - // copy data from our block's window to shared memory - // offsets information can get us on the row, then we need to know where the column - // starts to offset into the row data. - - // each thread is responsible for 8-byte chunks starting at threadIdx.x and striding - // at blockDim.x. If the 8-byte chunk falls on the boundary of the window, then the - // thread may copy less than 8 bytes. Even if at the beginning of the window, because - // every internal copy is aligned to 8-byte boundaries. - // - // thread 0 thread 1 thread 2 thread 3 thread 4 thread 5 - // 01234567 89abcdef 01234567 89abcdef 01234567 89abcdef - // xxxbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbxxxxxx - // | | | | | | | - // - // - - auto const window_start_quad = col_offsets[block.start_col] / 8; - auto const window_end_quad = (col_offsets[block.end_col] + col_sizes[block.end_col] + 7) / 8; - auto const window_quad_width = window_end_quad - window_start_quad; - auto const total_quads = window_quad_width * rows_in_block; - auto const shared_memory_starting_pad = col_offsets[block.start_col] & 0x7; - - if (debug_print) { - printf("col_offsets[%d]: %d, col_offsets[%d]: %d col_sizes[%d]: %d\n", block.start_col, col_offsets[block.start_col], block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col]); - printf("window start quad is %d, window end quad is %d\n", window_start_quad, window_end_quad); - printf("window quad width is %d and there are %d total quads\n%d shared memory starting pad\n", window_quad_width, total_quads, shared_memory_starting_pad); - } - - // the copy to shared memory will be greedy. We know that the data is 8-byte aligned, so we won't - // access illegal memory by doing 8-byte aligned copies, so we can copy 8-byte aligned. This will - // result in the window edges being duplicated across blocks, but we can copy the padding as well - // to speed up our transfers to shared memory. 
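The removed copy_to_columns pass stages each window in 8-byte "quads" so the reads from row data are always aligned: the window is widened outward to quad boundaries and the extra leading bytes are remembered as a starting pad. The bookkeeping in isolation, with an invented column layout (sketch only):

#include <cassert>

int main()
{
  // Window covers columns whose row-relative bytes span [5, 18): the first
  // column starts at offset 5, the last starts at 14 with size 4.
  int const start_offset  = 5;
  int const end_offset    = 14;
  int const end_size      = 4;
  int const rows_in_block = 32;

  int const window_start_quad = start_offset / 8;                 // 0
  int const window_end_quad   = (end_offset + end_size + 7) / 8;  // (18 + 7) / 8 == 3
  int const window_quad_width = window_end_quad - window_start_quad;
  int const total_quads       = window_quad_width * rows_in_block;
  int const starting_pad      = start_offset & 0x7;               // bytes staged before the first column

  assert(window_quad_width == 3);  // 24 bytes staged per row for a 13-byte window
  assert(total_quads == 96);
  assert(starting_pad == 5);
  return 0;
}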
- for (int i = threadIdx.x; i < total_quads; i += blockDim.x) { - auto const relative_row = i / window_quad_width; - auto const absolute_row = relative_row + block.start_row; - //auto const row = i / window_quad_width; - auto const offset_in_row = i % window_quad_width * 8; - auto const shmem_dest = &shared_data[i * 8]; - - if (debug_print) { - printf("relative_row: %d, absolute_row: %d, offset_in_row: %d, shmem_dest: %p\n", relative_row, absolute_row, offset_in_row, shmem_dest); - printf("offsets is %p\n", offsets); - printf("offsets[%d]: %d\n", absolute_row, offsets[absolute_row]); - printf("input_data[%d] will be dereferenced\n", offsets[absolute_row] + offset_in_row); - } - - // full 8-byte copy - const int64_t *long_col_input = - reinterpret_cast(&input_data[offsets[absolute_row] + offset_in_row]); - if (debug_print) { - printf("which will be address %p\n", long_col_input); - printf("%p <- long %lu\n", shmem_dest, *long_col_input); } - *reinterpret_cast(shmem_dest) = *long_col_input; - } - - __syncthreads(); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; - auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + - shared_memory_row_offset + shared_memory_starting_pad; - auto const column_size = col_sizes[absolute_col]; - - int8_t *shmem_src = &shared_data[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, - shmem_src, dst) ; - } - switch (column_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } - *dst = *shmem_src; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } - *reinterpret_cast(dst) = *short_col_input; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } - *reinterpret_cast(dst) = *int_col_input; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } - *reinterpret_cast(dst) = *long_col_input; - break; - } - default: { - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); - } - // TODO this should just not be supported for fixed width columns, but just 
in case... - for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } - break; - } - } - } - - // now handle validity. Each thread is responsible for 32 rows in 8 columns. - // to prevent indexing issues with a large number of threads, this is compressed - // to a single loop like above. TODO: investigate using shared memory here - auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); - if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); - } - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { - auto const start_col = (index * 8) / validity_batches_per_col; - auto const batch = index % validity_batches_per_col; - auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + (start_col / 8); - - if (debug_print) { - printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); - } - - // one for each column - int32_t dst_validity[8] = {0}; - for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { - int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; - - if (debug_print) { - printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); - } - - auto const val_byte = *validity_ptr; - - for (int i=0; i> src_shift); - } - // auto const dst_bit_mask = 1 << dst_shift; - dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - } - } - - - for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); - if (debug_print) { - printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); - } - *validity_ptr = dst_validity[i]; - } - } -} -} - /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. 
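The validity gather removed above transposes row-major validity bytes (one byte covers 8 columns of one row) into cudf's column-major 32-bit null-mask words (one word covers 32 rows of one column). A host-side emulation for a single group of 8 columns and 32 rows, assuming bit i of a row byte belongs to column start_col + i (a sketch, not the kernel):

#include <cassert>
#include <cstdint>
#include <vector>

// Sketch: one row-major validity byte per row becomes 8 column-major words,
// where bit r of column i's word marks row r of that column.
std::vector<uint32_t> transpose_validity(std::vector<uint8_t> const& row_bytes)
{
  std::vector<uint32_t> col_words(8, 0);
  for (std::size_t row = 0; row < row_bytes.size(); ++row) {
    for (int col = 0; col < 8; ++col) {
      uint32_t const bit = (row_bytes[row] >> col) & 1u;
      col_words[col] |= bit << row;
    }
  }
  return col_words;
}

int main()
{
  std::vector<uint8_t> row_bytes(32, 0);
  row_bytes[0]  = 0x01;  // row 0: only column 0 valid
  row_bytes[31] = 0x80;  // row 31: only column 7 valid
  auto const words = transpose_validity(row_bytes);
  assert(words[0] == 1u);          // column 0 has its bit for row 0 set
  assert(words[7] == (1u << 31));  // column 7 has its bit for row 31 set
  assert(words[3] == 0u);
  return 0;
}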
@@ -895,10 +552,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -929,10 +586,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start.data(), - column_size.data(), - input_data.data(), - input_nm.data(), + column_start->data(), + column_size->data(), + input_data->data(), + input_nm->data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -986,165 +643,21 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -template -static size_type compute_column_information( - iterator begin, - iterator end, - std::vector &column_starts, - std::vector &column_sizes)//, - //std::function nested_type_cb) -{ - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - -// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; -} +} // namespace detail //#define DEBUG - -static std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, - current_window_start_row, - end_col, - 
std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. - int const window_height = std::min( - std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), - row_batches[0].row_count); -#if defined(DEBUG) - printf( - "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], - total_number_of_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); - } - - return block_infos; -} -} // namespace detail + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough + // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. 
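The removed heuristic above aims for a roughly square window in bytes: with shmem_limit_per_block bytes to fill, height and width both land near sqrt(shmem), and the height is then clamped by the table's row count and the first row batch. The sizing in isolation, with assumed values (a sketch of the heuristic, not the patch code):

#include <algorithm>
#include <cassert>
#include <cmath>

int main()
{
  int const shmem_limit_per_block = 48 * 1024;  // typical per-block shared-memory budget
  int const first_col_size        = 4;          // element size of the first column, in bytes
  int const num_rows              = 1000000;
  int const first_batch_rows      = 500000;

  // sqrt(shmem) bytes of height, expressed in rows of the first column,
  // clamped by the table size and by the first row batch.
  int const window_height =
    std::min({static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_per_block))) / first_col_size,
              num_rows,
              first_batch_rows});

  assert(window_height == 55);  // sqrt(49152) ~ 221.7 -> 221 / 4 == 55
  return 0;
}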
+ constexpr int max_window_height = 1024; + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); -#if defined(DEBUG) - void pretty_print(uint64_t i) { + #if defined(DEBUG) + auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -1154,18 +667,8 @@ static std::vector build_block_infos(std::vector const &c } else { printf("%lu Bytes", i); } - } -#endif - -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + }; + #endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -1173,12 +676,6 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#if defined(DEBUG) - size_t free, total; - cudaMemGetInfo(&free, &total); - printf("%lu/%lu Memory\n", free, total); -#endif - // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -1194,78 +691,50 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
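Row batches exist because cudf list offsets are int32: once the accumulated bytes for a batch would overflow what a 32-bit offset can address, the output is cut back to the previous 32-row boundary and a new batch begins. A simplified host-side sketch of that cut logic; the limit and row sizes are stand-ins, and the byte/row roll-over mirrors the book-keeping above rather than a polished implementation:

#include <cassert>
#include <cstdint>
#include <vector>

struct row_batch_sketch { int64_t num_bytes; int32_t row_count; };

// Sketch: greedily fill batches, cutting back to a 32-row boundary when the
// next row would push the batch past the offset limit.
std::vector<row_batch_sketch> make_batches(std::vector<int64_t> const& row_sizes, int64_t limit)
{
  std::vector<row_batch_sketch> batches;
  int64_t batch_bytes = 0;
  int32_t batch_rows  = 0;
  for (auto const size : row_sizes) {
    if (batch_bytes + size > limit) {
      batches.push_back({batch_bytes, batch_rows & ~31});  // cut on the last 32-row boundary
      batch_bytes = 0;
      batch_rows  = batch_rows & 31;                       // rows past that boundary roll over
    }
    batch_bytes += size;
    ++batch_rows;
  }
  if (batch_bytes > 0) { batches.push_back({batch_bytes, batch_rows}); }
  return batches;
}

int main()
{
  std::vector<int64_t> const rows(100, 1024);          // 100 rows of 1 KiB each
  auto const batches = make_batches(rows, 64 * 1024);  // artificially small limit instead of INT32_MAX
  assert(batches.size() == 2);
  assert(batches[0].row_count == 64);  // already a multiple of 32, so nothing rolls over
  assert(batches[1].row_count == 36);  // the remaining rows
  return 0;
}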
- // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, - iter + num_columns, - column_starts, - column_sizes);//, -// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 
8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; + column_starts.reserve(num_columns+1); // we add a final offset for validity data start + + size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv);} + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. + // Note that this boundary is for our book-keeping with column pointers and not anything + // that the kernel needs to worry about. We cut the output at convienient boundaries + // when assembling the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -1277,156 +746,210 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. 
return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - */ +/* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } +*/ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns) * 4; + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate + // the size of each row's variable-width data as well. for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8) + - calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { + row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); + if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_batch_size += row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); } - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - -#if defined(DEBUG) - printf("%d rows and %d columns in table\n", num_rows, num_columns); + #if defined(DEBUG) printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); 
printf("\n"); } -#endif + #endif - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_size = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back( + detail::block_info{start_col, + current_window_start_row, + start_col + end_col, + std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (int col = 0; col < num_columns; ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + + if (row_size_with_this_col * window_height > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + current_window_start_col = col; + } else { + row_size = row_size_with_this_col; + } } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + auto validity_offset = detail::align_offset(column_starts.back(), 4); + column_starts.push_back(validity_offset); + + // build last set of blocks + if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + std::vector input_data; + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } -#if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", - 
block_infos.size(), - block_infos[0].end_col - block_infos[0].start_col + 1, - block_infos[0].end_row - block_infos[0].start_row); + #if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); -#endif + #endif - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i=0; i>>( - num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 blocks; + dim3 threads; + blocks.x = block_infos.size(); + blocks.y = 0; + blocks.z = 0; + threads.x = 1024; + threads.y = 0; + threads.z = 0; + detail::copy_from_columns<<>>(num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { + for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + for (int row=0; row( - data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = + std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = std::make_unique( - data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); + auto data = + std::make_unique(data_type{cudf::type_id::INT8}, + row_batches[i].num_bytes, + std::move(output_data[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } @@ -1445,8 +968,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + auto dev_column_start = 
detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1463,16 +986,16 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); + zero->set_valid(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); + step->set_valid(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1500,100 +1023,6 @@ std::vector> convert_to_rows(cudf::table_view cons } } -std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); - #endif -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); - printf(" shared memory\n"); -#endif - detail::copy_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_data.data(), - dev_output_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - child.data()); - - return std::make_unique(std::move(output_columns)); -} - std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1618,8 +1047,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1634,8 +1063,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1646,10 +1075,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), + 
dev_column_start->data(), + dev_column_size->data(), + dev_output_data->data(), + dev_output_nm->data(), child.data()); return std::make_unique(std::move(output_columns)); @@ -1674,20 +1103,4 @@ std::unique_ptr convert_from_rows( // } } -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows2(lcv, schema, stream, mr); - - return ret; - // } -} - } // namespace cudf From 3bff2aad0834b29b37df68f4b1d9cdf5e01e5742 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Jun 2021 17:53:09 +0000 Subject: [PATCH 09/80] fixing kernel launch and updating --- .../row_conversion/row_conversion.cpp | 9 +- cpp/src/row_conversion/row_conversion.cu | 105 +++++++++++++----- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index c4edee91b3c..9fa05c408e5 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -28,7 +28,7 @@ class RowConversion : public cudf::benchmark { static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, +/* auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -38,7 +38,10 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT8, cudf::type_id::UINT64}, 50, - row_count{n_rows}); + row_count{n_rows});*/ + auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows}); cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -98,7 +101,7 @@ static void BM_from_row(benchmark::State& state) (::benchmark::State & st) { BM_to_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 16, 1 << 24}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index fb5dc4cb38d..994233a0700 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -347,14 +348,14 @@ struct block_info { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, +__global__ void copy_from_columns(const size_type num_rows, + const size_type num_columns, const int8_t **input_data, - const cudf::bitmask_type **input_nm, - const cudf::size_type *col_sizes, - const cudf::size_type *col_offsets, + const bitmask_type **input_nm, + const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, - const uint64_t *row_offsets, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -365,47 +366,92 @@ __global__ void copy_from_columns(const cudf::size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
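As a standalone illustration of that two-pass idea (an illustration only, not the patch's kernel), the sketch below stages one column-major int32 tile into shared memory in row-major order and then writes the packed rows back out. The kernel name pack_tile, the single-tile launch, and the fixed sizes are invented for the example; it also omits the mixed column widths, validity bits and alignment shims the real kernel has to handle.

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

// Pass 1: read down each column (coalesced reads of col_major) and place each
// element at its final row-major position in shared memory.
// Pass 2: rows are now contiguous in shared memory, so the writes out to
// global memory are sequential.
__global__ void pack_tile(const int32_t *col_major,
                          int32_t *row_major,
                          int num_rows,
                          int num_cols)
{
  extern __shared__ int32_t tile[];  // num_rows * num_cols elements

  for (int i = threadIdx.x; i < num_rows * num_cols; i += blockDim.x) {
    int const col              = i / num_rows;
    int const row              = i % num_rows;
    tile[row * num_cols + col] = col_major[col * num_rows + row];
  }
  __syncthreads();

  for (int i = threadIdx.x; i < num_rows * num_cols; i += blockDim.x) {
    row_major[i] = tile[i];
  }
}

int main()
{
  constexpr int num_rows = 8, num_cols = 4, n = num_rows * num_cols;
  int32_t h_in[n], h_out[n];
  for (int c = 0; c < num_cols; ++c)
    for (int r = 0; r < num_rows; ++r) h_in[c * num_rows + r] = c * 100 + r;

  int32_t *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

  pack_tile<<<1, 64, n * sizeof(int32_t)>>>(d_in, d_out, num_rows, num_cols);

  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("row 0: %d %d %d %d\n", h_out[0], h_out[1], h_out[2], h_out[3]);  // 0 100 200 300
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Built with nvcc, the printed first row is one value from each column packed contiguously, which is the layout the second pass of the real kernel wants to stream out eight bytes at a time.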
+ bool debug_print = false; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest - - printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); - + if (debug_print) { + printf("outputting to offset %lu\n", output_start_offset); + printf("dest shim offset is %d\n", dest_shim_offset); + printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; + if (debug_print) { + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("shmem row size %d\n", shmem_row_size); + printf("validity offset is %d\n", validity_offset); + printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + } for (int col=block.start_col; col<=block.end_col; ++col) { /*if (!col_is_variable) */{ uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + if (debug_print) { + printf("dest col offset %d\n", dest_col_offset); + } for (int row=block.start_row + threadIdx.x; row(input_data[col]); + if (debug_print) { + printf("%p <- short %d\n", shmem_dest, short_col_input[row]); + } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { const int32_t *int_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); + } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - // TODO this should just not be supported for fixed width columns, but just in case... + if (debug_print) { + printf("byte for byte copy due to size %d\n", col_size); + printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -676,6 +722,12 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + #if defined(DEBUG) + size_t free, total; + cudaMemGetInfo( &free, &total ); + printf("%lu/%lu Memory", free, total); + #endif + // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -692,7 +744,7 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment std::vector variable_width_columns; // list of the variable width columns in the table @@ -821,7 +873,7 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, current_window_start_row, - start_col + end_col, + end_col, std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); i += window_height; @@ -889,23 +941,20 @@ std::vector> convert_to_rows2(cudf::table_view con auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - std::vector output_data; + std::vector output_buffers; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i=0; i(temp.data())); + output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it - dim3 blocks; - dim3 threads; - blocks.x = block_infos.size(); - blocks.y = 0; - blocks.z = 0; - threads.x = 1024; - threads.y = 0; - threads.z = 0; - detail::copy_from_columns<<>>(num_rows, + dim3 blocks(block_infos.size()); + dim3 threads(1024); + copy_from_columns<<>>(num_rows, num_columns, dev_input_data.data(), dev_input_nm.data(), @@ -932,14 +981,14 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_data[i])); + std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), From 8e52ba174b06f11ecd12a9f3fe35de17ade4f9e6 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Jun 2021 19:25:57 +0000 Subject: [PATCH 10/80] Updates and bug fixing --- .../row_conversion/row_conversion.cpp | 76 ++- cpp/src/row_conversion/row_conversion.cu | 498 ++++++++++++------ cpp/tests/row_conversion/row_conversion.cpp | 106 ---- 3 files changed, 378 insertions(+), 302 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 9fa05c408e5..e1228c9df21 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,10 +25,43 @@ class RowConversion : public cudf::benchmark { }; -static void BM_to_row(benchmark::State& state) +static void BM_old_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; -/* auto const table = 
create_random_table({cudf::type_id::INT8, + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto rows = cudf::convert_to_rows(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_new_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -37,11 +70,11 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 50, - row_count{n_rows});*/ - auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows}); + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,14 +85,13 @@ static void BM_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); -// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_from_row(benchmark::State& state) +/*static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -73,9 +105,6 @@ static void BM_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 4, - row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -94,18 +123,19 @@ static void BM_from_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_to_row(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +}*/ + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ @@ -116,4 +146,4 @@ TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) ->UseManualTime() \ 
->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 994233a0700..92ba075c316 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -44,7 +44,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } - /** * Copy a simple vector to device memory asynchronously. Be sure to read * the data on the same stream as is used to copy it. @@ -61,10 +60,9 @@ std::unique_ptr> copy_to_dev_async(const std::vector & } template -rmm::device_uvector copy_to_dev_async2( - const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +rmm::device_uvector copy_to_dev_async2(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_uvector ret(input.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( @@ -346,7 +344,7 @@ struct block_info { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, @@ -366,92 +364,119 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; - + bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("Column Info:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + uint8_t const dest_shim_offset = + reinterpret_cast(&output_data[0][output_start_offset]) & + 7; // offset for alignment shim in order to match shared memory with final dest if (debug_print) { printf("outputting to offset %lu\n", output_start_offset); printf("dest shim offset is %d\n", dest_shim_offset); printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + printf("my block is %d,%d -> %d,%d - buffer %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row, + block.buffer_num); } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
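The shared-memory stride and alignment shim computed in the hunk below are plain integer arithmetic, so they are easy to sanity-check on the host. A minimal sketch, assuming made-up column offsets and a fake destination address; align_offset mirrors the helper defined near the top of this file.

#include <cstdint>
#include <cstdio>

// power-of-two alignment helper, same shape as the one in row_conversion.cu
static int32_t align_offset(int32_t offset, std::size_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main()
{
  // assumed example: a window whose first column starts at byte 6 of the row
  // and whose last column is one byte wide starting at byte 20
  int32_t const start_col_offset = 6;
  int32_t const end_col_offset   = 20;
  int32_t const end_col_size     = 1;

  // pretend the destination row address ends in ...3: the low three bits are
  // the shim needed so shared memory lines up with the final destination
  uintptr_t const fake_dest_addr = 0x7f0000000003u;
  int const dest_shim_offset     = static_cast<int>(fake_dest_addr & 7);

  int32_t const real_bytes_in_row = end_col_offset + end_col_size - start_col_offset;
  int32_t const shmem_row_size    = align_offset(real_bytes_in_row + dest_shim_offset, 8);

  printf("bytes in window row: %d, shim: %d, shared-memory stride: %d\n",
         real_bytes_in_row, dest_shim_offset, shmem_row_size);
  return 0;
}

With these numbers the window carries 15 real bytes per row, needs a 3-byte shim to match the destination alignment, and so each staged row occupies a 24-byte, 8-byte-aligned stride in shared memory.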
- auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const real_bytes_in_row = + col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; + auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, + 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", + block.end_col, + col_offsets[block.end_col], + block.end_col, + col_sizes[block.end_col], + block.start_col, + col_offsets[block.start_col]); printf("shmem row size %d\n", shmem_row_size); printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + printf("starting at %d,%d and going to %d, %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row); } - for (int col=block.start_col; col<=block.end_col; ++col) { - /*if (!col_is_variable) */{ - uint64_t col_offset = 0; + for (int col = block.start_col; col <= block.end_col; ++col) { + /*if (!col_is_variable) */ { + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { - printf("dest col offset %d\n", dest_col_offset); - } - for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { - printf("%p <- short %d\n", shmem_dest, short_col_input[row]); - } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); + const int32_t *int_col_input = reinterpret_cast(input_data[col]); if (debug_print) { - printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); - } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; if (debug_print) { - printf("byte for byte copy due to size %d\n", col_size); - printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... + printf("byte for byte copy due to size %d of column %d\n", col_size, col); + printf("%p <- input_data[%d] which is %d\n", + shmem_dest, + input_offset, + input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -463,11 +488,13 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + int8_t *valid_byte = + &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -479,11 +506,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * (block.end_row - block.start_row); + col_offset += col_sizes[col] * rows_in_block; } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -496,30 +523,75 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + if (debug_print) { + printf("writing final data from %d to %d at stride %d\n", + thread_start_offset, + shmem_row_size * rows_in_block, + thread_stride); + printf("rows in block %d\n", rows_in_block); + } + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - // the first part and last part of the row is unaligned data copy. This is copied a single byte - // at a time. - if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first part of a row, copy single bytes + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + + // three cases to worry about here + // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front + // 2) last 8-byte part of a large row - some bytes of pad at the end + // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front + // AND potentially pad at the rear + + // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. + // 1st case is when we're at some even multiple of shmem_row_size offset. 
+ // 2nd case is when offset + 8 is some even multiple of shmem_row_size. + // must be an 8 byte copy + + // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? + if (real_bytes_in_row + dest_shim_offset <= 8) { + // case 3, we want to copy real_bytes_in_row bytes + auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("case 3 - %d single byte final write %p -> %p\n", + num_single_bytes, + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } + output_ptr[i] = input_ptr[i + dest_shim_offset]; + } + } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first byte with leading pad auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i=0; i %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } - } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { - // last part of a row, copy single bytes - auto const num_single_bytes = dest_shim_offset; - for (auto i=0; i 0) { + // last bytes of a row + auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("single trailing byte final write %p -> %p\n", + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } } else { // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); + const int64_t *long_col_input = reinterpret_cast(input_ptr); + if (debug_print) { + printf( + "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); + } *reinterpret_cast(output_ptr) = *long_col_input; } } @@ -696,13 +768,14 @@ std::vector> convert_to_rows2(cudf::table_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough - // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. constexpr int max_window_height = 1024; - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); - #if defined(DEBUG) +#if defined(DEBUG) auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); @@ -714,7 +787,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu Bytes", i); } }; - #endif +#endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -722,11 +795,11 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - #if defined(DEBUG) +#if defined(DEBUG) size_t free, total; - cudaMemGetInfo( &free, &total ); - printf("%lu/%lu Memory", free, total); - #endif + cudaMemGetInfo(&free, &total); + printf("%lu/%lu Memory\n", free, total); +#endif // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -743,45 +816,46 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns+1); // we add a final offset for validity data start + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); + auto cv = tbl.column(col); + auto col_type = cv.type(); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - if (nested_type) { variable_width_columns.push_back(cv);} + if (nested_type) { variable_width_columns.push_back(cv); } // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; } - + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. - // Note that this boundary is for our book-keeping with column pointers and not anything - // that the kernel needs to worry about. We cut the output at convienient boundaries - // when assembling the outgoing data stream. + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. 
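A host-only sketch of that batching rule, with invented (uniform) row sizes: close a batch just before its byte count would overflow a 32-bit offset, and cut it back to the last multiple of 32 rows so no validity word straddles two batches.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
  std::vector<uint64_t> row_sizes(100000, 48 * 1024);  // pretend 48 KiB rows

  struct batch { uint64_t num_bytes; int row_count; };
  std::vector<batch> batches;

  uint64_t batch_bytes = 0;
  int batch_rows       = 0;
  for (auto size : row_sizes) {
    if (batch_bytes + size > std::numeric_limits<int32_t>::max()) {
      int const keep = batch_rows & ~31;            // last 32-row boundary
      // bytes belonging to the rows we keep (uniform sizes keep this simple)
      uint64_t const keep_bytes = uint64_t(keep) * size;
      batches.push_back({keep_bytes, keep});
      batch_rows -= keep;                           // rows carried into the next batch
      batch_bytes -= keep_bytes;
    }
    batch_bytes += size;
    ++batch_rows;
  }
  if (batch_rows > 0) batches.push_back({batch_bytes, batch_rows});

  for (auto const &b : batches)
    printf("batch: %d rows, %llu bytes\n",
           b.row_count,
           static_cast<unsigned long long>(b.num_bytes));
  return 0;
}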
struct row_batch { size_type num_bytes; size_type row_count; @@ -798,71 +872,90 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. return 0; -/* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } -*/ + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + auto calculate_validity_size = [](int const num_cols) { + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + return (num_cols + 7) / 8; + }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate - // the size of each row's variable-width data as well. + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. for (int row = 0; row < num_rows; ++row) { - row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); - if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += calculate_validity_size(num_columns); + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; + row_batches.push_back( + row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size += row_sizes[row]; + row_batch_size = aligned_row_batch_size + row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); 
} - #if defined(DEBUG) +#if defined(DEBUG) + printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); printf("\n"); } - #endif +#endif std::vector block_infos; // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access - int current_window_size = 0; + int current_window_width = 0; int current_window_start_col = 0; // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + auto build_blocks = [&block_infos, &row_batches, num_rows]( + int const start_col, int const end_col, int const desired_window_height) { int current_window_start_row = 0; int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; while (i < num_rows) { if (rows_left_in_batch == 0) { current_window_row_batch++; @@ -872,9 +965,10 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, num_rows - 1), + current_window_row_batch}); i += window_height; current_window_start_row += window_height; @@ -882,7 +976,17 @@ std::vector> convert_to_rows2(cudf::table_view con } }; - int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + int const window_height = + std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); +#if defined(DEBUG) + printf( + "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + max_window_height, + num_rows, + row_batches[0].row_count, + window_height); +#endif int row_size = 0; @@ -891,32 +995,74 @@ std::vector> convert_to_rows2(cudf::table_view con auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; if (row_size_with_this_col * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_this_col * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_this_col, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col - 1, window_height); - row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row current_window_start_col = col; + current_window_width = 0; } else { row_size = row_size_with_this_col; + current_window_width++; } } - auto validity_offset = detail::align_offset(column_starts.back(), 4); +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif + auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); column_starts.push_back(validity_offset); - + // build last set of blocks - if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); + } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while + // calculating other things std::vector input_data; std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); + column_view cv = tbl.column(column_number); auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; if (!nested_type) { input_data.emplace_back(cv.data()); @@ -924,81 +1070,87 @@ std::vector> convert_to_rows2(cudf::table_view con } } - #if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); +#if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", + block_infos.size(), + block_infos[0].end_col - block_infos[0].start_col + 1, + block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); - #endif +#endif auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); auto 
dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); std::vector output_buffers; std::vector output_data; output_data.reserve(row_batches.size()); - for (uint i=0; i(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(1024); - copy_from_columns<<>>(num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + copy_from_columns<<>>( + num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = - std::make_unique(data_type{cudf::type_id::INT8}, - row_batches[i].num_bytes, - std::move(output_buffers[i])); + auto data = std::make_unique( + data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 818d7a89ddb..c02f83ad1d5 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,13 +21,9 @@ #include #include -#include "cudf/lists/lists_column_view.hpp" -#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; TEST_F(ColumnToRowTests, Single) { @@ -112,105 +108,3 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); 
- - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i=0; i> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i=0; i> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i=0; i Date: Mon, 21 Jun 2021 18:17:45 +0000 Subject: [PATCH 11/80] Updating windows to be generated in a square way so we can have more data to write out as 8-byte writes from shared memory. Shuffled some of the copy to GPU code up so it can start the copy sooner and hopefully won't force stalls. Some bug fixes. 
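A rough host-side sketch of the "square" window sizing described in this commit message, under assumed numbers (48 KiB of shared memory per block, a 4-byte first column): aim for a window whose height in rows and width in bytes are both near the square root of the shared memory size, so both the column reads and the 8-byte row writes stay reasonably long.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
  int const shmem_limit    = 48 * 1024;  // assumed shared memory per block
  int const num_rows       = 1 << 20;    // rows in the (pretend) table
  int const first_col_size = 4;          // bytes of the first column

  // height in rows: roughly sqrt(shared memory) bytes worth of the first column
  int const window_height =
    std::min(static_cast<int>(std::sqrt(static_cast<double>(shmem_limit))) / first_col_size,
             num_rows);

  // given that height, the widest slice of a row (in bytes) a window can hold
  int const max_row_bytes = shmem_limit / window_height;

  printf("window: %d rows tall, up to %d bytes of columns wide\n",
         window_height, max_row_bytes);
  return 0;
}

The real code additionally caps the height at the first row batch's row count and rebuilds the running width as columns of different sizes are appended; this sketch only shows the square-root starting point.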
--- .../row_conversion/row_conversion.cpp | 15 ++- cpp/src/row_conversion/row_conversion.cu | 96 +++++++++++-------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index e1228c9df21..d6b195433cf 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -125,7 +125,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); }*/ -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ @@ -134,8 +134,17 @@ static void BM_new_to_row(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 92ba075c316..3f221e2f716 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -364,7 +364,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
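One part of the kernel touched in this hunk that can be checked without a GPU is the validity packing: the byte that holds a column's null bit is rounded down to a 4-byte boundary and the bit index is bumped up by the bytes stepped over, so the store can be issued as a single 32-bit atomic OR. A small sketch of just that address arithmetic, with a made-up column index.

#include <cstdint>
#include <cstdio>

int main()
{
  // a row's validity area, viewed both as 32-bit words and as raw bytes
  uint32_t validity_words[4] = {0};
  auto *row_bytes = reinterpret_cast<unsigned char *>(validity_words);

  int const column          = 13;                    // bit to set (made up)
  unsigned char *valid_byte = &row_bytes[column / 8];
  int const byte_bit_offset = column % 8;

  // round the byte address down to a 4-byte boundary...
  uintptr_t const fixup_bytes = reinterpret_cast<uintptr_t>(valid_byte) % 4;
  uint32_t *valid_word = reinterpret_cast<uint32_t *>(valid_byte - fixup_bytes);
  // ...and push the bit index up by the bytes stepped back over
  int const word_bit_offset = byte_bit_offset + static_cast<int>(fixup_bytes) * 8;

  *valid_word |= 1u << word_bit_offset;  // the device code does this atomically

  printf("validity byte %d is now 0x%02x (little-endian layout, as on the GPU)\n",
         column / 8, row_bytes[column / 8]);
  return 0;
}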
- bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,6 +383,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } + //else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -416,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); - printf("shmem row size %d\n", shmem_row_size); + printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); printf("validity offset is %d\n", validity_offset); printf("starting at %d,%d and going to %d, %d\n", block.start_col, @@ -524,6 +525,8 @@ __global__ void copy_from_columns(const size_type num_rows, // auto const thread_start_offset = threadIdx.x * 8; auto const thread_stride = gridDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; + if (debug_print) { printf("writing final data from %d to %d at stride %d\n", thread_start_offset, @@ -531,7 +534,7 @@ __global__ void copy_from_columns(const size_type num_rows, thread_stride); printf("rows in block %d\n", rows_in_block); } - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + for (auto src_offset = thread_start_offset; src_offset < end_offset; src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; auto const row_offset = row_offsets[block.start_row + output_row_num]; @@ -771,7 +774,6 @@ std::vector> convert_to_rows2(cudf::table_view con // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. - constexpr int max_window_height = 1024; const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -816,6 +818,25 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
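The hunk that follows moves the gathering of the per-column data and null-mask pointers, and their device copies, ahead of the row-size bookkeeping so the transfers can start while the host is still working. A minimal sketch of that overlap pattern using only the CUDA runtime, with invented buffer contents; note that a pageable host buffer limits how much overlap actually happens, so pinning the staging buffer (cudaMallocHost) would be the natural next step.

#include <cuda_runtime.h>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> staging(1024);                 // stand-in for the pointer tables
  std::iota(staging.begin(), staging.end(), 0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  int *dev_copy = nullptr;
  cudaMalloc(&dev_copy, staging.size() * sizeof(int));

  // kick the copy off first; it proceeds on the stream while the host continues
  cudaMemcpyAsync(dev_copy,
                  staging.data(),
                  staging.size() * sizeof(int),
                  cudaMemcpyHostToDevice,
                  stream);

  // ... host-side bookkeeping (row sizes, batches, window building) would go
  // here, overlapping the transfer instead of waiting behind it ...
  long long checksum = std::accumulate(staging.begin(), staging.end(), 0LL);

  // a kernel launched on the same stream would be ordered after the copy;
  // here we just wait for it explicitly
  cudaStreamSynchronize(stream);
  printf("host work done (checksum %lld), device buffer ready\n", checksum);

  cudaFree(dev_copy);
  cudaStreamDestroy(stream);
  return 0;
}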
+ // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column @@ -847,6 +868,9 @@ std::vector> convert_to_rows2(cudf::table_view con fixed_width_size_per_row += col_size; } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient @@ -901,17 +925,18 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. + auto validity_size = calculate_validity_size(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned row_sizes[row] = fixed_width_size_per_row; // validity is byte aligned - row_sizes[row] += calculate_validity_size(num_columns); + row_sizes[row] += validity_size; // variable width data is 8-byte aligned row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); @@ -932,7 +957,9 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); } -#if defined(DEBUG) + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -942,6 +969,16 @@ std::vector> convert_to_rows2(cudf::table_view con } #endif + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + std::vector block_infos; // block infos are organized with the windows going "down" the 
columns @@ -976,8 +1013,13 @@ std::vector> convert_to_rows2(cudf::table_view con } }; + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized + // access, but since other blocks will read/write the edges this may not turn out to be overly important. + // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. + // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are + // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. int const window_height = - std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); #if defined(DEBUG) printf( "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " @@ -998,20 +1040,21 @@ std::vector> convert_to_rows2(cudf::table_view con std::size_t alignment_needed = col_size; // They are the same for fixed width types auto row_size_aligned = detail::align_offset(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_this_col * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { #if defined(DEBUG) printf( "Window size %d too large at column %d, bumping back to build windows of size %d(cols " "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", - row_size_with_this_col * window_height, + row_size_with_end_pad * window_height, col, row_size * window_height, current_window_start_col, col - 1, window_height, - row_size_with_this_col, + row_size_with_end_pad, row_size, row_size_aligned, shmem_limit_per_block); @@ -1055,20 +1098,6 @@ std::vector> convert_to_rows2(cudf::table_view con build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while - // calculating other things - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1083,26 +1112,11 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - 
output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); + dim3 threads(std::min(1024, shmem_limit_per_block / 8)); #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); From 5cf1cf1afccacd0f7c9b0d47596176926b8b0858 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jul 2021 01:52:36 +0000 Subject: [PATCH 12/80] Adding row to column conversion code. Performance falls off a cliff, but starts out reasonably. I haven't looked at this in nsight yet. --- .../row_conversion/row_conversion.cpp | 74 +- cpp/include/cudf/row_conversion.hpp | 12 + cpp/src/row_conversion/row_conversion.cu | 759 +++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 106 +++ 4 files changed, 748 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index d6b195433cf..7c1f52c5cd6 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -91,7 +91,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -/*static void BM_from_row(benchmark::State& state) +static void BM_old_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -123,36 +123,62 @@ static void BM_new_to_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -}*/ - -#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +} + +static void BM_new_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows2(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} -#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, 
name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_from_row(st); } \ + (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 22}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index f5e2225ad19..282ffa4b0cb 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,4 +48,16 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows2( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 3f221e2f716..c0e78a03576 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -30,6 +30,7 @@ #include #include +#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" @@ -332,6 +333,20 @@ struct block_info { int buffer_num; }; +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. 
+struct row_batch { + size_type num_bytes; + size_type row_count; +}; + /** * @brief copy data from cudf columns into x format, which is row-based * @@ -364,7 +379,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,7 +398,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } - //else { return; } + // else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -403,7 +418,7 @@ __global__ void copy_from_columns(const size_type num_rows, block.buffer_num); } // each thread is responsible for every threadcount rows of data. - // the data is copies into shared memory in the final layout. + // the data is copied into shared memory in the final layout. auto const real_bytes_in_row = col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, @@ -432,7 +447,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += gridDim.x) { + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { if (debug_print) { printf("shmem row %d(%d) at offset %d(%d)\n", row - block.start_row, @@ -524,8 +539,8 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. 
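The stride changes in this hunk and the next (the per-row loop and the final-write loop now step by blockDim.x rather than gridDim.x) both move to a block-stride loop, where each thread of a block covers every blockDim.x-th element of that block's own slice. A tiny standalone sketch of the pattern, with illustrative names:

__global__ void scale_block_stride(float *data, int n, float factor)
{
  // each block owns a contiguous slice of n elements
  int const base = blockIdx.x * n;
  // consecutive threads touch consecutive elements, so accesses stay coalesced;
  // striding by gridDim.x here would instead skip by the number of blocks,
  // which is the grid-stride pattern and visits different elements than intended
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    data[base + i] *= factor;
  }
}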
// auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; + auto const thread_stride = blockDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; if (debug_print) { printf("writing final data from %d to %d at stride %d\n", @@ -559,9 +574,10 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; for (auto i = 0; i < num_single_bytes; ++i) { if (debug_print) { - printf("case 3 - %d single byte final write %p -> %p\n", + printf("case 3 - %d single byte final write %p(%d) -> %p\n", num_single_bytes, &input_ptr[i + dest_shim_offset], + input_ptr[i + dest_shim_offset], &output_ptr[i]); } output_ptr[i] = input_ptr[i + dest_shim_offset]; @@ -600,6 +616,237 @@ __global__ void copy_from_columns(const size_type num_rows, } } +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type *offsets, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const size_type *col_sizes, + const size_type *col_offsets, + const block_info *block_infos, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i = 0; i < num_columns; ++i) { + printf("col %d is at %p with size %d and offset %d\n", + i, + output_data[i], + col_sizes[i], + col_offsets[i]); + } + printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + /* printf("Row Offsets:\n"); + for (int i=0; i(&input_data[offsets[absolute_row] + offset_in_row]); + if (debug_print) { + printf("which will be address %p\n", long_col_input); + printf("%p <- long %lu\n", shmem_dest, *long_col_input); } + *reinterpret_cast(shmem_dest) = *long_col_input; + } + + __syncthreads(); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; + auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + + shared_memory_row_offset + shared_memory_starting_pad; + auto const column_size = col_sizes[absolute_col]; + + int8_t *shmem_src = &shared_data[shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, + shmem_src, dst) ; + } + switch (column_size) { + case 1: { + if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } + *dst = *shmem_src; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } + *reinterpret_cast(dst) = *short_col_input; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } + *reinterpret_cast(dst) = *int_col_input; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } + *reinterpret_cast(dst) = *long_col_input; + break; + } + default: { + if (debug_print) { + printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + } + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } + break; + } + } + } + + __syncthreads(); + + // now handle validity. Each thread is responsible for 32 rows in a single column. + // to prevent indexing issues with a large number of threads, this is compressed + // to a single loop like above. TODO: investigate using shared memory here + auto const validity_batches_per_col = (num_rows + 31) / 32; + auto const validity_batches_total = validity_batches_per_col * num_columns; + if (debug_print) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + } + for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { + // what column is this? 
+ auto const col = index / validity_batches_per_col; + auto const batch = index % validity_batches_per_col; + auto const starting_row = batch * 32; + auto const validity_offset = col_offsets[num_columns] + col / 8; + + if (debug_print) { + printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + } + + int32_t dst_validity = 0; + for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { + int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + + if (debug_print) { + printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + } + + auto const val_byte = *validity_ptr; + auto const src_shift = col % 8; + auto const dst_shift = row % 32; + auto const src_bit_mask = 1 << src_shift; + if (debug_print) { + printf("src bit mask is 0x%x\n", src_bit_mask); + printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); + printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); + } +// auto const dst_bit_mask = 1 << dst_shift; + dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); + if (debug_print) { + printf("validity is now 0x%x\n", dst_validity); + } + } + + + int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); + if (debug_print) { + printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); + printf("validity to write is %d\n", dst_validity); + printf("validity write %p <- %d\n", validity_ptr, dst_validity); + } + *validity_ptr = dst_validity; + } +} + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -764,21 +1011,165 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -} // namespace detail +template +static size_type compute_column_information( + iterator begin, + iterator end, + std::vector &column_starts, + std::vector &column_sizes)//, + //std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + +// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 
8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} //#define DEBUG -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + +static std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. 
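To make the square-window heuristic above concrete: assuming, purely for illustration, 48 KiB of shared memory per block and an 8-byte leading column, sqrt(49152) is about 221, so the window starts out roughly 221 bytes on a side, i.e. 221 / 8 = 27 rows tall for that column width. The min() on the next line then clamps that height by the total row count and the first batch's row count, and the column-marching loop that follows closes the current window and starts a new one whenever the padded row slice times the window height would exceed the shared-memory limit.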
+ int const window_height = std::min( + std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), + row_batches[0].row_count); +#if defined(DEBUG) + printf( + "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + total_number_of_rows, + row_batches[0].row_count, + window_height); +#endif + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_end_pad * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_end_pad, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + } + + return block_infos; +} +} // namespace detail #if defined(DEBUG) - auto pretty_print = [](uint64_t i) { + void pretty_print(uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -788,9 +1179,19 @@ std::vector> convert_to_rows2(cudf::table_view con } else { printf("%lu Bytes", i); } - }; + } #endif +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. 
+ const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -834,8 +1235,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -848,43 +1249,48 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, + iter + num_columns, + column_starts, + column_sizes);//, +// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ - if (nested_type) { variable_width_columns.push_back(cv); } +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - - // When building the columns to return, we have to be mindful of the offset limit in cudf. 
- // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. - struct row_batch { - size_type num_bytes; - size_type row_count; - }; - std::vector row_batches; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -936,10 +1342,11 @@ std::vector> convert_to_rows2(cudf::table_view con row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( - row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); row_batch_size = 0; row_batch_rows = row_batch_rows & 31; row_offset = 0; @@ -954,12 +1361,12 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - #if defined(DEBUG) +#if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -979,125 +1386,8 @@ std::vector> convert_to_rows2(cudf::table_view con } auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back( - detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has 
lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized - // access, but since other blocks will read/write the edges this may not turn out to be overly important. - // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. - // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are - // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. - int const window_height = - std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); -#if defined(DEBUG) - printf( - "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - max_window_height, - num_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (int col = 0; col < num_columns; ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); - column_starts.push_back(validity_offset); - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); - } - + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1116,7 +1406,11 @@ std::vector> convert_to_rows2(cudf::table_view con // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 
threads(std::min(1024, shmem_limit_per_block / 8)); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); + #endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); @@ -1206,11 +1500,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); + zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid(true, stream); + step->set_valid_async(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1238,6 +1532,97 @@ std::vector> convert_to_rows(cudf::table_view cons } } +std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + + size_type validity_size = (num_columns + 7) / 8; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + + dim3 blocks(block_infos.size()); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); + #endif +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + detail::copy_to_columns<<>>( + num_rows, + num_columns, + input.offsets().data(), + dev_output_data.data(), + dev_output_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + child.data()); + + return std::make_unique(std::move(output_columns)); +} + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1318,4 +1703,20 @@ std::unique_ptr convert_from_rows( // } } +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows2(lcv, schema, stream, mr); + + return ret; + // } +} + } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index c02f83ad1d5..818d7a89ddb 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,9 +21,13 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" +#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; +struct RowToColumnTests : public cudf::test::BaseFixture { +}; TEST_F(ColumnToRowTests, Single) { @@ -108,3 +112,105 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i a({-1, 0, 1}); + cudf::table_view 
in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i Date: Thu, 8 Jul 2021 20:45:18 +0000 Subject: [PATCH 13/80] updating to use make_device_uvector_async and bitmask functions per review comments --- cpp/src/row_conversion/row_conversion.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c0e78a03576..c73e967cf0f 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -45,32 +47,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync( - ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - -template -rmm::device_uvector copy_to_dev_async2(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - rmm::device_uvector ret(input.size(), stream, mr); - CUDA_TRY(cudaMemcpyAsync( - ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -180,8 +156,8 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } @@ -278,8 +254,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -505,8 +481,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; - cudf::size_type byte_bit_offset = col % 8; + &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; + cudf::size_type byte_bit_offset = intra_word_index(col); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -648,7 +624,7 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
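With the hand-rolled copy_to_dev_async helpers removed above, the call sites now go through cudf's vector factory. A hedged usage sketch (assuming the factory lives in cudf/detail/utilities/vector_factories.hpp, and with stream and mr supplied by the caller as at the converted call sites):

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <vector>

// sketch only: copy a host vector to the device asynchronously on `stream`.
// The returned device_uvector owns the allocation; the host vector must stay
// alive until the copy has completed (e.g. until the stream is synchronized).
rmm::device_uvector<cudf::size_type> to_device(std::vector<cudf::size_type> const &host,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource *mr)
{
  return cudf::detail::make_device_uvector_async(host, stream, mr);
}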
- bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -806,7 +782,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const col = index / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + col / 8; + auto const validity_offset = col_offsets[num_columns] + word_index(col); if (debug_print) { printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); @@ -821,7 +797,7 @@ __global__ void copy_to_columns(const size_type num_rows, } auto const val_byte = *validity_ptr; - auto const src_shift = col % 8; + auto const src_shift = intra_word_index(col); auto const dst_shift = row % 32; auto const src_bit_mask = 1 << src_shift; if (debug_print) { @@ -920,10 +896,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -954,10 +930,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -1004,7 +980,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = word_index(schema.size() + 7); // validity comes at the end and is byte aligned so we can pack more in. 
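One distinction worth keeping in mind around these helpers: the packed row format stores validity byte aligned (column c maps to byte c / 8, bit c % 8 within the row), while cudf's bitmask utilities index 32-bit bitmask words. A small illustration, assuming the constexpr helpers in cudf/utilities/bit.hpp and a hypothetical column index of 13:

#include <cudf/utilities/bit.hpp>

// byte-aligned row validity: column 13 lives in byte 1, bit 5 of the row's validity area
static_assert(13 / 8 == 1 && 13 % 8 == 5, "row validity is byte aligned");
// cudf bitmask utilities operate on 32-bit words: word 0, bit 13
static_assert(cudf::word_index(13) == 0 && cudf::intra_word_index(13) == 13,
              "bitmask utilities index 32-bit words");

The two indexings only agree for the first byte's worth of columns, which is why the byte arithmetic matters for the row layout even though the column-side null masks use 32-bit bitmask_type words.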
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1235,8 +1211,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -1287,8 +1263,8 @@ std::vector> convert_to_rows2(cudf::table_view con #endif - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); std::vector row_batches; @@ -1322,16 +1298,9 @@ std::vector> convert_to_rows2(cudf::table_view con size_type row_batch_rows = 0; uint64_t row_offset = 0; - auto calculate_validity_size = [](int const num_cols) { - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - return (num_cols + 7) / 8; - }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = calculate_validity_size(num_columns); + auto validity_size = num_bitmask_words(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1364,7 +1333,7 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); @@ -1384,7 +1353,7 @@ std::vector> convert_to_rows2(cudf::table_view con output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); @@ -1402,7 +1371,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); @@ -1443,7 +1412,7 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); auto offsets = std::make_unique( data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); @@ -1477,8 +1446,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = 
detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1495,8 +1464,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -1561,7 +1530,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = (num_columns + 7) / 8; + size_type validity_size = num_bitmask_words(num_columns); size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1569,8 +1538,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // this is probably fine CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // build the row_batches from the passed in list column std::vector row_batches; @@ -1590,13 +1559,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(block_infos.size()); #if defined(DEBUG) || 1 @@ -1647,8 +1616,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1663,8 +1632,8 @@ 
std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1675,10 +1644,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); From 1d0245bffc592f80ba6b4fce7d9bcf9d585eef30 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jul 2021 07:18:49 +0000 Subject: [PATCH 14/80] updating conversion code. Found out bit operations are on 32-bit values, so they can't be used since row data has byte-aligned validity. Performance improvements on the row to column side. --- cpp/src/row_conversion/row_conversion.cu | 106 ++++++++++++----------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c73e967cf0f..0879a1c50a5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -37,6 +37,8 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) + using cudf::detail::make_device_uvector_async; namespace cudf { @@ -156,11 +158,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -254,8 +256,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -481,8 +483,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; - cudf::size_type 
byte_bit_offset = intra_word_index(col); + &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -597,6 +599,7 @@ __global__ void copy_from_columns(const size_type num_rows, * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block * @param offsets * @param output_data * @param output_nm @@ -608,6 +611,7 @@ __global__ void copy_from_columns(const size_type num_rows, */ __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *offsets, int8_t **output_data, cudf::bitmask_type **output_nm, @@ -624,18 +628,10 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - output_data[i], - col_sizes[i], - col_offsets[i]); - } printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i blockDim.x) { + break; + } + auto block = block_infos[this_block_index]; auto const rows_in_block = block.end_row - block.start_row + 1; auto const cols_in_block = block.end_col - block.start_col + 1; extern __shared__ int8_t shared_data[]; @@ -767,61 +769,58 @@ __global__ void copy_to_columns(const size_type num_rows, } } - __syncthreads(); - - // now handle validity. Each thread is responsible for 32 rows in a single column. + // now handle validity. Each thread is responsible for 32 rows in 8 columns. // to prevent indexing issues with a large number of threads, this is compressed // to a single loop like above. TODO: investigate using shared memory here auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = validity_batches_per_col * num_columns; - if (debug_print) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); + if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); } - for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { - // what column is this? 
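// Illustrative sketch, not from the patch itself: the validity loop being reworked here gathers one
// column's bits from up to 32 consecutive rows into the 32-bit word cudf's null mask expects. The
// kernel walks rows through the offsets[] array; this sketch assumes a constant row stride purely to
// stay self-contained, and all names (gather_column_validity, row_size, ...) are illustrative.
#include <cstdint>

__device__ inline uint32_t gather_column_validity(int8_t const* rows,    // first of up to 32 rows
                                                  int row_size,          // bytes per row
                                                  int validity_offset,   // start of validity bytes in a row
                                                  int col,               // column index
                                                  int rows_remaining)
{
  uint32_t word = 0;
  for (int r = 0; r < rows_remaining && r < 32; ++r) {
    auto const byte = static_cast<uint8_t>(rows[r * row_size + validity_offset + col / 8]);
    word |= ((byte >> (col % 8)) & 1u) << r;  // bit r of the word is row r's validity
  }
  return word;
}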
- auto const col = index / validity_batches_per_col; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { + auto const start_col = (index * 8) / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + word_index(col); + auto const validity_offset = col_offsets[num_columns] + (start_col / 8); if (debug_print) { - printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); } - int32_t dst_validity = 0; + // one for each column + int32_t dst_validity[8] = {0}; for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; if (debug_print) { - printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); } auto const val_byte = *validity_ptr; - auto const src_shift = intra_word_index(col); - auto const dst_shift = row % 32; - auto const src_bit_mask = 1 << src_shift; - if (debug_print) { - printf("src bit mask is 0x%x\n", src_bit_mask); - printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); - printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); - } -// auto const dst_bit_mask = 1 << dst_shift; - dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - if (debug_print) { - printf("validity is now 0x%x\n", dst_validity); + + for (int i=0; i> src_shift); + } + // auto const dst_bit_mask = 1 << dst_shift; + dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); - if (debug_print) { - printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); - printf("validity to write is %d\n", dst_validity); - printf("validity write %p <- %d\n", validity_ptr, dst_validity); + for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); + if (debug_print) { + printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); + } + *validity_ptr = dst_validity[i]; } - *validity_ptr = dst_validity; } } +} /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -980,7 +979,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = word_index(schema.size() + 7); + int32_t validity_bytes_needed = (schema.size() + 7) / 8; // validity comes at the end and is byte aligned so we can pack more in. 
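// Illustrative note, not from the patch itself: this revert is the point of patch 14's commit
// message - cudf's bitmask helpers (word_index / intra_word_index) address 32-bit bitmask words,
// while the row format keeps validity byte aligned, so the byte math has to stay as explicit /8
// and %8. A tiny self-contained restatement; the *_32 and validity_* names are illustrative only.
constexpr int word_index_32(int bit)        { return bit / 32; }  // 32-bit bitmask word holding `bit`
constexpr int intra_word_index_32(int bit)  { return bit % 32; }  // bit position inside that word
constexpr int validity_byte_index(int col)  { return col / 8; }   // row-format byte holding `col`
constexpr int validity_bit_in_byte(int col) { return col % 8; }   // bit position inside that byte

static_assert(word_index_32(40) == 1 && validity_byte_index(40) == 5,
              "bit 40 sits in 32-bit word 1, but in byte 5 of a byte-aligned mask");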
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1300,7 +1299,7 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns); + auto validity_size = num_bitmask_words(num_columns) * 4; for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1521,6 +1520,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; + std::vector column_starts; std::vector column_sizes; @@ -1530,7 +1531,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = num_bitmask_words(num_columns); + size_type validity_size = num_bitmask_words(num_columns) * 4; size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1567,7 +1568,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(block_infos.size()); + dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); #if defined(DEBUG) || 1 dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); #else @@ -1581,6 +1582,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i detail::copy_to_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_output_nm.data(), From 65490e027df4aa5e55292731434c679f79d0d58b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 13 Sep 2021 19:46:03 +0000 Subject: [PATCH 15/80] updating for memcpy_async and validation in a different kernel --- .../row_conversion/row_conversion.cpp | 47 +- cpp/include/cudf/row_conversion.hpp | 38 +- cpp/src/row_conversion/row_conversion.cu | 1926 ++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 132 +- java/src/main/native/src/row_conversion.cu | 1293 ++++++++++- java/src/main/native/src/row_conversion.hpp | 12 + 6 files changed, 2714 insertions(+), 734 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 7c1f52c5cd6..ad9925e9043 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -20,7 +20,8 @@ #include #include -#include "cudf_test/column_utilities.hpp" +#include +#include class RowConversion : public cudf::benchmark { }; @@ -39,9 +40,6 @@ static void BM_old_to_row(benchmark::State& state) cudf::type_id::UINT64}, 212, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,7 +50,7 @@ static void BM_old_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto rows = cudf::convert_to_rows(table->view()); + auto rows = 
cudf::old_convert_to_rows(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -72,9 +70,6 @@ static void BM_new_to_row(benchmark::State& state) cudf::type_id::UINT64}, 212, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -85,7 +80,7 @@ static void BM_new_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto new_rows = cudf::convert_to_rows2(table->view()); + auto new_rows = cudf::convert_to_rows(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -114,12 +109,13 @@ static void BM_old_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::convert_to_rows(table->view()); + auto rows = cudf::old_convert_to_rows(table->view()); + cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::convert_from_rows(rows, schema); + auto out = cudf::old_convert_from_rows(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -148,36 +144,37 @@ static void BM_new_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::convert_to_rows(table->view()); + auto rows = cudf::old_convert_to_rows(table->view()); + cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::convert_from_rows2(rows, schema); + auto out = cudf::convert_from_rows(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } #define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 282ffa4b0cb..8f82d01b06c 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -24,40 +24,28 @@ namespace cudf { -std::vector> convert_to_rows( - cudf::table_view const &tbl, +std::vector> old_convert_to_rows( + cudf::table_view const& tbl, // TODO need 
something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::vector> convert_to_rows2( - cudf::table_view const &tbl, +std::vector> convert_to_rows( + cudf::table_view const& tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows( - cudf::lists_column_view const &input, - std::vector const &schema, +std::unique_ptr old_convert_from_rows( + cudf::lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, + cudf::lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0879a1c50a5..42c40e0542d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -18,26 +18,42 @@ #include #include #include +#include +#include + +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#include +#endif #include #include +#include +#include #include +#include #include #include +#include #include #include #include + #include +#include #include -#include -#include -#include "cudf/types.hpp" -#include "rmm/device_buffer.hpp" -#include "thrust/iterator/counting_iterator.h" -#include "thrust/iterator/transform_iterator.h" +#include +#include -#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#endif using cudf::detail::make_device_uvector_async; namespace cudf { @@ -52,11 +68,11 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const int8_t *input_data) + const cudf::size_type* 
input_offset_in_row, + const cudf::size_type* num_bytes, + int8_t** output_data, + cudf::bitmask_type** output_nm, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -81,15 +97,15 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, // Because we are copying fixed width only data and we stride the rows // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); + int64_t* long_shared = reinterpret_cast(shared_data); + const int64_t* long_input = reinterpret_cast(input_data); cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); cudf::size_type shared_output_stride = blockDim.x * blockDim.y; @@ -125,26 +141,26 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { cudf::size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; + const int8_t* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t* col_output = output_data[col_index]; switch (col_size) { case 1: { col_output[row_index] = *col_tmp; break; } case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); + int16_t* short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); + int32_t* int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); + int64_t* long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); break; } default: { @@ -157,12 +173,12 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } } - cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::bitmask_type* nm = output_nm[col_index]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -174,11 +190,11 @@ __global__ void 
copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, - const cudf::size_type *num_bytes, - const int8_t **input_data, - const cudf::bitmask_type **input_nm, - int8_t *output_data) + const cudf::size_type* output_offset_in_row, + const cudf::size_type* num_bytes, + const int8_t** input_data, + const cudf::bitmask_type** input_nm, + int8_t* output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -205,8 +221,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, // Because we are copying fixed width only data and we stride the rows // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; @@ -223,26 +239,26 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { cudf::size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; + int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t* col_input = input_data[col_index]; switch (col_size) { case 1: { *col_tmp = col_input[row_index]; break; } case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; + const int16_t* short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; + const int32_t* int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; + const int64_t* long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; break; } default: { @@ -256,10 +272,10 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { @@ -279,8 +295,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, // Step 2: Copy the data back out // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - 
int64_t *long_output = reinterpret_cast(output_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t* long_output = reinterpret_cast(output_data); cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); cudf::size_type shared_input_stride = blockDim.x * blockDim.y; @@ -303,12 +319,35 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + struct block_info { int start_col; int start_row; int end_col; int end_row; int buffer_num; + + __host__ __device__ size_type get_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const + { + if (debug_print) + printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", + end_col, + &col_offsets[end_col], + end_col, + &col_sizes[end_col], + start_col, + &col_offsets[start_col], + col_offsets[end_col], + col_sizes[end_col], + col_offsets[start_col]); + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } }; // When building the columns to return, we have to be mindful of the offset limit in cudf. @@ -341,13 +380,14 @@ struct row_batch { */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, - const int8_t **input_data, - const bitmask_type **input_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const size_type *row_offsets, - int8_t **output_data) + const size_type shmem_used_per_block, + const size_type num_block_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + const block_info* block_infos, + const size_type* row_offsets, + int8_t** output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -357,239 +397,597 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
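// Illustrative sketch, not from the patch itself: the kernels below stage data through shared
// memory with cuda::memcpy_async and cuda::barrier, keeping NUM_BLOCKS_PER_KERNEL_LOADED (= 2)
// staging buffers in flight. This is a stripped-down version of that pattern with illustrative
// names (staged_copy, chunk_bytes, ...), and like the guarded code below it needs sm_70 or newer.
#include <cooperative_groups.h>
#include <cuda/barrier>

#include <cstddef>
#include <cstdint>

__global__ void staged_copy(int8_t* out, int8_t const* in, int num_chunks, int chunk_bytes)
{
  // launch with 2 * chunk_bytes of dynamic shared memory: two staging buffers
  extern __shared__ int8_t staging[];
  int8_t* stage[2] = {staging, staging + chunk_bytes};

  auto block = cooperative_groups::this_thread_block();
  __shared__ cuda::barrier<cuda::thread_scope_block> ready[2];
  if (block.thread_rank() == 0) {
    init(&ready[0], block.size());
    init(&ready[1], block.size());
  }
  block.sync();

  int next_fetch = 0;
  for (int chunk = 0; chunk < num_chunks; ++chunk) {
    // keep up to two async fetches in flight, one per staging buffer
    for (; next_fetch < num_chunks && next_fetch < chunk + 2; ++next_fetch) {
      cuda::memcpy_async(block,
                         stage[next_fetch % 2],
                         in + static_cast<std::size_t>(next_fetch) * chunk_bytes,
                         chunk_bytes,
                         ready[next_fetch % 2]);
    }
    ready[chunk % 2].arrive_and_wait();  // the staging buffer for `chunk` is now resident
    for (int i = block.thread_rank(); i < chunk_bytes; i += block.size()) {
      out[static_cast<std::size_t>(chunk) * chunk_bytes + i] = stage[chunk % 2][i];
    }
    block.sync();  // every thread is done with this buffer before it is refilled
  }
}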
- bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; // blockIdx.x == 0 && threadIdx.x == 1; + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - input_data[i], - col_sizes[i], - col_offsets[i]); - } + printf("col sizes at %p, col offsets at %p, and row offsets at %p\n", + col_sizes, + col_offsets, + row_offsets); printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it more + // appealing to copy element-wise from input data into shared matching the end layout and do + // row-based memcopies out. + + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + if (debug_print) + printf("row %d(%d), col %d(%d), %d fetch rows, element %d\n", + relative_row, + absolute_row, + relative_col, + absolute_col, + num_fetch_rows, + el); + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + if (debug_print) + printf("block %lu to shared chunk %lu. 
%p <- %p - %d bytes\n", + fetch, + fetch % stages_count, + &shared[fetch % stages_count][shared_offset], + input_src, + col_size); + + // copy the main + cuda::memcpy_async( + &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); + } + } + + auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + if (debug_print) + printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset); + + /* auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = + output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + if (debug_print) + printf("processing row %d\noutput data[%d] is address %p\n", + absolute_row, + absolute_row, + output_dest); + auto const shared_offset = block_row_size * relative_row; + if (debug_print) + printf("memcpy %p <- %p - %d bytes which is row %d\n", + output_dest, + &shared[subset % stages_count][shared_offset], + block_row_size, + absolute_row); + cuda::memcpy_async( + output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_from_columns(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_block, + const size_type* row_offsets, + int8_t** output_data, + const size_type validity_offset, + const block_info* block_infos, + const size_type num_block_infos, + const bitmask_type** input_nm) +{ extern __shared__ int8_t shared_data[]; - uint64_t const output_start_offset = col_offsets[block.start_col] + row_offsets[block.start_row]; - uint8_t const dest_shim_offset = - reinterpret_cast(&output_data[0][output_start_offset]) & - 7; // offset for alignment shim in order to match shared memory with final dest - if (debug_print) { - printf("outputting to offset %lu\n", output_start_offset); - printf("dest shim offset is %d\n", dest_shim_offset); - printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); - printf("my block is %d,%d -> %d,%d - buffer %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row, - block.buffer_num); + int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + constexpr bool print_debug 
= false; //(threadIdx.x==0 || threadIdx.x == 32) && blockIdx.x == 0; + // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + if (print_debug) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("%d %d - block infos are at %p and my index is %d\n", + threadIdx.x, + blockIdx.x, + block_infos, + blockIdx.x); + printf("%d %d - input nm is %p, input_nm[0] is at %p\n", + threadIdx.x, + blockIdx.x, + input_nm, + input_nm[0]); + printf("shared memory is %p to %p\n", shared_data, shared_data + shmem_used_per_block * 2); + printf("block infos at %p and this is index %d\n", + &block_infos, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + 0); + /* printf("Row Offsets:\n"); + for (int i=0; i + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } } - for (int col = block.start_col; col <= block.end_col; ++col) { - /*if (!col_is_variable) */ { - uint64_t col_offset = 0; - cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = - col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - if (debug_print) { - printf("shmem row %d(%d) at offset %d(%d)\n", - row - block.start_row, - row, - (row - block.start_row) * shmem_row_size, - row * shmem_row_size); - } - int8_t *shmem_dest = - &shared_data[dest_col_offset + shmem_row_size * (row - block.start_row)]; - switch (col_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", shmem_dest, input_data[col][row]); } - *shmem_dest = input_data[col][row]; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } - *reinterpret_cast(shmem_dest) = short_col_input[row]; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); - } - *reinterpret_cast(shmem_dest) = int_col_input[row]; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } - *reinterpret_cast(shmem_dest) = long_col_input[row]; - break; - } - default: { - cudf::size_type input_offset = col_size * row; - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", col_size, col); - printf("%p <- input_data[%d] which is %d\n", - shmem_dest, - input_offset, - input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - shmem_dest[b] = input_data[col][b + input_offset]; + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + if (print_debug) + printf("%d: waiting at barrier %d\n", + threadIdx.x, + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED); + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + if (print_debug) printf("past barrier...\n"); + } + int8_t* this_shared_block = shared_blocks[validity_block % 2]; + if (print_debug) printf("top of loop for validity block %d\n", validity_block); + if (print_debug) + printf("reading validity block info %d at %p\n", + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + &block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]); + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 31) / 32; + auto const num_sections_y = (num_block_rows + 7) / 8; + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + if (print_debug) { + printf("%d %d - block %d has %d cols, %d rows, %d row length, and %d total sections\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + num_block_cols, + num_block_rows, + validity_data_row_length, + total_sections); + } + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + if (print_debug) + printf( + "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d\n", + threadIdx.x, + blockIdx.x, + warp_id, + total_sections, + warps_per_block, + blockDim.x, + detail::warp_size); + // the block is divided into sections. A warp operates on a section at a time. 
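// Illustrative sketch, not from the patch itself: the comment above describes how a validity block
// is carved into 32-column x 8-row sections, one section per warp and one column per lane. The
// section-count arithmetic, restated with illustrative names:
struct section_dims {
  int sections_x;  // ceil(block columns / 32)
  int sections_y;  // ceil(block rows / 8)
};

constexpr section_dims sections_for(int num_block_cols, int num_block_rows)
{
  return {(num_block_cols + 31) / 32, (num_block_rows + 7) / 8};
}
// e.g. a 40-column x 20-row block gives 2 x 3 = 6 sections for the block's warps to share.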
+ for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx / num_sections_x; + auto const section_y = my_section_idx % num_sections_x; + + if (print_debug) printf("working on section %d of %d...\n", section_x, num_sections_x); + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 8; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + + if (print_debug) printf("pre ballot sync...\n"); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (print_debug) + printf( + "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d absolute)\n", + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + + if (absolute_col < num_columns) { + if (print_debug) + printf( + "thread %d's byte is at %p, participation mask is 0x%x for relative row %d(%d real), " + "relative col %d(%d absolute)\n", + threadIdx.x, + &input_nm[absolute_col][absolute_row / 32], + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + auto my_byte = + input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + + if (print_debug) + printf( + "thread %d's byte is 0x%x, participation mask is 0x%x for relative row %d(%d real), " + "relative col %d(%d absolute)\n", + threadIdx.x, + my_byte & 0xFF, + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // the bytes we actually write. 
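// Illustrative sketch, not from the patch itself: the ballot described in the comment above,
// reduced to its core. Every active lane holds the validity bits of one column; __ballot_sync
// packs each lane's bit for a given row into one 32-bit word, i.e. 32 columns' worth of row-major
// validity. Names are illustrative; participation_mask is the earlier ballot of lanes that map to
// a real column, and every lane named in it must reach this call together.
#include <cstdint>

__device__ inline uint32_t row_major_validity_word(uint32_t participation_mask,
                                                   uint32_t my_column_bits,  // this column's rows
                                                   int row)                  // 0..7 within the section
{
  return __ballot_sync(participation_mask, (my_column_bits >> row) & 1);
}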
+ for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (print_debug) + printf( + "%d %d - byte_mask is 0x%x, masked_byte is 0x%x, shared_data_block[%d][%d] = " + "0x%x\n", + threadIdx.x, + blockIdx.x, + byte_mask, + my_byte & byte_mask, + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, + validity_write_offset, + validity_data); + if (cols_left <= 8) { + // write byte + if (print_debug) + printf("writing single byte to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + if (print_debug) + printf("writing two bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + if (print_debug) + printf("writing three bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + if (print_debug) + printf("writing 4 bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; } - break; } } + } + } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - // we do this directly in the final location because the entire row may not - // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; - cudf::size_type byte_bit_offset = col % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } - // Now copy validity for the column - if (input_nm[col]) { - if (bit_is_set(input_nm[col], row)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end row + // make sure entire block has finished copy + group.sync(); - col_offset += col_sizes[col] * rows_in_block; + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + if (print_debug) { + printf( + "base output data is %p, row offset is 0x%x, validity offset into row is 0x%x, word " + "index of block start is 0x%x\n", + output_data[block.buffer_num], + row_offsets[row], + validity_offset, + 
word_index(block.start_col)); + printf( + "%d %d - row %d/%d/%d col %d-%d - %p = shared_data_block[%d][%d] which is %p - %d " + "bytes\n - %p <- 0x%x\n", + threadIdx.x, + blockIdx.x, + block.start_row, + row, + block.end_row, + block.start_col, + block.end_col, + output_data[block.buffer_num] + row_offsets[row] + validity_offset + + (word_index(block.start_col)), + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, + validity_data_row_length * relative_row, + &this_shared_block[validity_data_row_length * relative_row], + util::div_rounding_up_unsafe(num_block_cols, 8), + output_data[block.buffer_num] + row_offsets[row] + validity_offset + + word_index(block.start_col), + this_shared_block[validity_data_row_length * relative_row]); + } + auto const output_ptr = + output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( + output_ptr, + &this_shared_block[validity_data_row_length * relative_row], + num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + + /* auto const padding_ptr = output_ptr + num_bytes; + auto const padding_needed = -reinterpret_cast(padding_ptr) & 7; + if (print_debug) printf( + "absolute_row: %d, row_offset for this row: 0x%x, validity data bytes: %d, end + address: %p, padding bytes %lu\n", row, row_offsets[row], num_bytes, output_ptr + + num_bytes, padding_needed); cuda::memcpy_async(padding_ptr, zero, padding_needed, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + */ + + /* if (print_debug) { + for (int i=0; i %p\n", - num_single_bytes, - &input_ptr[i + dest_shim_offset], - input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first byte with leading pad - auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf( - "single byte final write %p -> %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if ((src_offset + 8) % shmem_row_size == 0 && - (real_bytes_in_row + dest_shim_offset) % 8 > 0) { - // last bytes of a row - auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("single trailing byte final write %p -> %p\n", - &input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else { - // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); - if (debug_print) { - printf( - "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); +} + +static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, + size_t col_offset_size, + int const num_cols) +{ + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; +} + +/** + * @brief ensure `read_ahead` buffer blocks are fetched + * + * @param fetch_index internal state passed into the function + * @param processing_index index where processing is occuring + * @param read_ahead_count how many blocks to read ahead + * @param max_resident_blocks how many blocks can be loaded at once + * @param total_blocks total number of blocks 
overall + * @param block_infos pointer to the block infos + * @param col_sizes pointer to column size information + * @param col_offsets pointer to the table's column offsets + * @param row_offsets pointer to offsets for each row in the table + * @param input_data pointer to the input data + * @param shared pointer to shared memory + * @param group thread group participating in the fetch + * @param block_barrier barriers used for each block + * @param debug_print + * @return + */ +static __device__ void fetch_blocks_for_row_to_column( + size_t& fetch_index, + size_t const processing_index, + int const read_ahead_count, + int const max_resident_blocks, + int const total_blocks, + block_info const* const block_infos, + size_type const* const col_sizes, + size_type const* const col_offsets, + size_type const* const row_offsets, + int8_t const* const input_data, + int8_t* shared[], + cooperative_groups::thread_block const group, + cuda::barrier* block_barrier, + bool debug_print) +{ + for (; fetch_index < static_cast(total_blocks) && + fetch_index < (processing_index + read_ahead_count); + ++fetch_index) { + if (debug_print) + printf("fetching block %lu of %d\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks); + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); + auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } + + auto shared_row_offset = 0; + // copy the data for column sizes + if (debug_print) + printf("%d: col sizes memcpy_async(group, %p, %p, %d, barrier);\n", + threadIdx.x, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_size_bytes); + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d fetching to %p with barrier %p\n", + threadIdx.x, + blockIdx.x, + shared[fetch_index % max_resident_blocks], + &fetch_barrier); + cuda::memcpy_async(group, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_sizes[fetch_block.start_col], + col_size_bytes, + fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + if (debug_print) + printf("%d: offsets memcpy_async(group, %p, %p, %d, barrier);\n", + threadIdx.x, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_offset_bytes); + cuda::memcpy_async(group, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_offset_bytes, + fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0 && fetch_block.start_col == 0 && + fetch_block.start_row <= 51 && fetch_block.end_row >= 51) { + printf("Input 
data for col 0 row 51 is 0x"); + for (int i = 0; i < col_sizes[0]; ++i) { + printf("%x ", input_data[row_offsets[51] + col_offsets[0] + i]); } - *reinterpret_cast(output_ptr) = *long_col_input; + printf("\n"); + printf( + "this is at offset %d-%d and starting column offset is %d and we're reading %d bytes\n", + col_offsets[0], + col_offsets[0] + col_sizes[0], + starting_col_offset, + fetch_block_row_size); + auto shared_offset = (51 - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + printf("destination is %p", &shared[fetch_index % max_resident_blocks][shared_offset]); + } + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; + row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + if (debug_print) + printf("fetching block %lu to shared chunk %lu. %p <- %p\n", + fetch_index, + fetch_index % max_resident_blocks, + &shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset]); + // copy the main + cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, + fetch_barrier); } } } @@ -600,7 +998,7 @@ __global__ void copy_from_columns(const size_type num_rows, * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets + * @param row_offsets * @param output_data * @param output_nm * @param col_sizes array of sizes for each element in a column - one per column @@ -612,13 +1010,13 @@ __global__ void copy_from_columns(const size_type num_rows, __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *offsets, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const int8_t *input_data) + const size_type* row_offsets, + int8_t** output_data, + const size_type* _col_sizes, + const size_type* _col_offsets, + const block_info* block_infos, + const size_type num_block_infos, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -628,7 +1026,14 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
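The staging pattern used throughout copy_to_columns can be hard to follow once the barriers are involved. Below is a minimal host-side sketch of the ping-pong scheme it relies on, assuming two resident staging buffers (mirroring NUM_BLOCKS_PER_KERNEL_LOADED); the block count, buffer contents, and plain copies are invented stand-ins for the real cuda::memcpy_async / cuda::barrier machinery.

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  constexpr int stages_count = 2;  // mirrors NUM_BLOCKS_PER_KERNEL_LOADED
  int const total_blocks     = 5;  // hypothetical number of blocks assigned to one kernel block

  std::vector<std::vector<int>> staging(stages_count);  // stand-in for the two shared-memory halves

  std::size_t fetch = 0;
  for (std::size_t subset = 0; subset < static_cast<std::size_t>(total_blocks); ++subset) {
    // fetch ahead up to stages_count blocks; block i lands in staging slot i % stages_count
    for (; fetch < static_cast<std::size_t>(total_blocks) && fetch < subset + stages_count; ++fetch) {
      staging[fetch % stages_count] = std::vector<int>(4, static_cast<int>(fetch));  // "memcpy_async"
      std::printf("fetched block %zu into staging slot %zu\n", fetch, fetch % stages_count);
    }
    // "arrive_and_wait": block `subset` is now resident and can be copied out to its destination
    std::printf("processing block %zu from staging slot %zu\n", subset, subset % stages_count);
  }
  return 0;
}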
- constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + /*constexpr*/ bool debug_print = false; // threadIdx.x == 0; + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -638,189 +1043,387 @@ __global__ void copy_to_columns(const size_type num_rows, printf("%d: %d\n", i, row_offsets[i]); }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); + printf("shared memory pointers are %p and %p\n", shared[0], shared[1]); + printf("shared_memory ends at %p\n", &shared_data[shmem_used_per_block * 2]); + printf("group is %d threads\n", group.size()); } -// else { return; } + // else { return; } - for (int block_offset = 0; block_offset < NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; ++block_offset) { - auto this_block_index = blockIdx.x*NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + block_offset; - if (this_block_index > blockDim.x) { - break; + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); } - auto block = block_infos[this_block_index]; - auto const rows_in_block = block.end_row - block.start_row + 1; - auto const cols_in_block = block.end_col - block.start_col + 1; - extern __shared__ int8_t shared_data[]; + } - // copy data from our block's window to shared memory - // offsets information can get us on the row, then we need to know where the column - // starts to offset into the row data. - - // each thread is responsible for 8-byte chunks starting at threadIdx.x and striding - // at blockDim.x. If the 8-byte chunk falls on the boundary of the window, then the - // thread may copy less than 8 bytes. Even if at the beginning of the window, because - // every internal copy is aligned to 8-byte boundaries. - // - // thread 0 thread 1 thread 2 thread 3 thread 4 thread 5 - // 01234567 89abcdef 01234567 89abcdef 01234567 89abcdef - // xxxbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbxxxxxx - // | | | | | | | - // - // - - auto const window_start_quad = col_offsets[block.start_col] / 8; - auto const window_end_quad = (col_offsets[block.end_col] + col_sizes[block.end_col] + 7) / 8; - auto const window_quad_width = window_end_quad - window_start_quad; - auto const total_quads = window_quad_width * rows_in_block; - auto const shared_memory_starting_pad = col_offsets[block.start_col] & 0x7; + group.sync(); - if (debug_print) { - printf("col_offsets[%d]: %d, col_offsets[%d]: %d col_sizes[%d]: %d\n", block.start_col, col_offsets[block.start_col], block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col]); - printf("window start quad is %d, window end quad is %d\n", window_start_quad, window_end_quad); - printf("window quad width is %d and there are %d total quads\n%d shared memory starting pad\n", window_quad_width, total_quads, shared_memory_starting_pad); - } + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - // the copy to shared memory will be greedy. 
We know that the data is 8-byte aligned, so we won't - // access illegal memory by doing 8-byte aligned copies, so we can copy 8-byte aligned. This will - // result in the window edges being duplicated across blocks, but we can copy the padding as well - // to speed up our transfers to shared memory. - for (int i = threadIdx.x; i < total_quads; i += blockDim.x) { - auto const relative_row = i / window_quad_width; - auto const absolute_row = relative_row + block.start_row; - //auto const row = i / window_quad_width; - auto const offset_in_row = i % window_quad_width * 8; - auto const shmem_dest = &shared_data[i * 8]; - - if (debug_print) { - printf("relative_row: %d, absolute_row: %d, offset_in_row: %d, shmem_dest: %p\n", relative_row, absolute_row, offset_in_row, shmem_dest); - printf("offsets is %p\n", offsets); - printf("offsets[%d]: %d\n", absolute_row, offsets[absolute_row]); - printf("input_data[%d] will be dereferenced\n", offsets[absolute_row] + offset_in_row); - } + auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), + col_offset_size = sizeof(decltype(*_col_offsets))]( + int const num_cols, + int const num_rows) -> std::tuple { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; - // full 8-byte copy - const int64_t *long_col_input = - reinterpret_cast(&input_data[offsets[absolute_row] + offset_in_row]); - if (debug_print) { - printf("which will be address %p\n", long_col_input); - printf("%p <- long %lu\n", shmem_dest, *long_col_input); } - *reinterpret_cast(shmem_dest) = *long_col_input; - } + return {col_size_bytes, col_offset_bytes}; + }; - __syncthreads(); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. 
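For reference, the flattened indexing described in the comment above (used by both the old and the new copy loop) boils down to one strided loop plus a divide and modulo to recover the row and column. A host-side sketch with made-up block dimensions, not values from the patch:

#include <cstdio>

int main()
{
  int const rows_in_block = 3;
  int const cols_in_block = 4;
  int const block_dim     = 5;  // pretend blockDim.x == 5 threads

  for (int tid = 0; tid < block_dim; ++tid) {
    // each "thread" strides over the flattened element grid
    for (int index = tid; index < rows_in_block * cols_in_block; index += block_dim) {
      int const relative_col = index % cols_in_block;
      int const relative_row = index / cols_in_block;
      std::printf("thread %d handles row %d, col %d\n", tid, relative_row, relative_col);
    }
  }
  return 0;
}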
- for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; - auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + - shared_memory_row_offset + shared_memory_starting_pad; - auto const column_size = col_sizes[absolute_col]; - - int8_t *shmem_src = &shared_data[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, - shmem_src, dst) ; - } - switch (column_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } - *dst = *shmem_src; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } - *reinterpret_cast(dst) = *short_col_input; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } - *reinterpret_cast(dst) = *int_col_input; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } - *reinterpret_cast(dst) = *long_col_input; - break; + if (debug_print) + printf("%d blocks remaining -> %d block infos, %d block index\n", + blocks_remaining, + num_block_infos, + blockIdx.x); + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + fetch_blocks_for_row_to_column(fetch, + subset, + stages_count, + stages_count, + blocks_remaining, + block_infos, + _col_sizes, + _col_offsets, + row_offsets, + input_data, + shared, + group, + block_barrier, + debug_print); + + auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + // ensure our data is ready + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d reading block %lu at address %p\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset, + shared[subset % stages_count]); + + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); + // auto shared_row_offsets = shared[subset]; + auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_row_size(_col_offsets, _col_sizes, debug_print); + + 
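The shared-memory image built for each staged block is: the block's column sizes, then its column offsets, padded out to an 8-byte boundary, followed by the block's rows, each padded to 8 bytes. The host-side sketch below reproduces only that address arithmetic; the three-column layout and all names are invented for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

static int32_t align_offset(int32_t offset, int32_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main()
{
  // hypothetical block of three columns: int8 at 0, int32 at 4, int64 at 8
  std::vector<int32_t> col_sizes{1, 4, 8};
  std::vector<int32_t> col_offsets{0, 4, 8};

  int32_t const admin_bytes      = static_cast<int32_t>(col_sizes.size() * sizeof(int32_t) * 2);
  int32_t const shared_row_start = align_offset(admin_bytes, 8);
  int32_t const block_row_size =
    align_offset(col_offsets.back() + col_sizes.back() - col_offsets.front(), 8);

  int const relative_row = 2, relative_col = 1;
  int32_t const element_offset = (col_offsets[relative_col] - col_offsets[0]) +
                                 relative_row * block_row_size + shared_row_start;
  std::printf("row size %d, admin %d bytes, element (%d,%d) lives at shared byte %d\n",
              block_row_size, admin_bytes, relative_row, relative_col, element_offset);
  return 0;
}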
// now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. + for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + if (debug_print) + printf("copying for row %d(%d absolute) col %d(%d absolute)\n", + relative_row, + absolute_row, + relative_col, + absolute_col); + + auto const shared_memory_row_offset = block_row_size * relative_row; + if (debug_print) + printf("shared_col_offsets is %p and relative col is %d, making me access %p\n", + shared_col_offsets, + relative_col, + &shared_col_offsets[relative_col]); + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + if (debug_print) + printf("shared_col_sizes is %p and relative col is %d, making me access %p\n", + shared_col_sizes, + relative_col, + &shared_col_sizes[relative_col]); + auto const column_size = shared_col_sizes[relative_col]; + + int8_t* shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf( + "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " + "shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", + relative_col, + relative_row, + absolute_col, + absolute_row, + shared_memory_row_offset, + shared_memory_offset, + column_size, + shmem_src, + dst/*, + *reinterpret_cast(shmem_src)*/); + printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); } - default: { - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + if (debug_print && absolute_col == 0 && absolute_row == 51) { + printf("col0row51(%d bytes) = %p - 0x", column_size, shmem_src); + for (int i = 0; i < column_size; ++i) { + printf("%x ", shmem_src[i]); } - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } - break; + printf("\n"); } + + cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); } + group.sync(); + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf( + "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_block, + const size_type* row_offsets, + cudf::bitmask_type** output_nm, + const size_type validity_offset, + const block_info* block_infos, + const size_type num_block_infos, + const int8_t* input_data) +{ + extern __shared__ int8_t shared_data[]; + int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; - // now handle validity. Each thread is responsible for 32 rows in 8 columns. - // to prevent indexing issues with a large number of threads, this is compressed - // to a single loop like above. TODO: investigate using shared memory here - auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); - if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); + bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; + // bool print_debug = false; + // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + if (print_debug) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("%d %d - block infos are at %p and my index is %d\n", + threadIdx.x, + blockIdx.x, + block_infos, + blockIdx.x); + printf( + "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, row " + "offsets are %p, block infos at %p\n", + threadIdx.x, + blockIdx.x, + shared_data, + shared_data + shmem_used_per_block, + input_data, + output_nm, + row_offsets, + block_infos); + /* printf("Row Offsets:\n"); + for (int i=0; i + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); } + } - // one for each column - int32_t dst_validity[8] = {0}; - for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { - int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + group.sync(); - if (debug_print) { - printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); - } - - auto const val_byte = 
*validity_ptr; - - for (int i=0; i> src_shift); + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t* this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const total_sections = num_sections_x * num_sections_y; + + if (print_debug) { + printf("%d %d - block %d has %d cols, %d rows, and %d total sections\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + num_block_cols, + num_block_rows, + total_sections); + } + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + if (print_debug) + printf( + "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d\n", + threadIdx.x, + blockIdx.x, + warp_id, + total_sections, + warps_per_block, + blockDim.x, + detail::warp_size); + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + if (print_debug) + printf("%d-%d: si: %d nsx: %d nsy: %d sx: %d sy: %d ar: %d nr: %d rc: %d rr: %d\n", + threadIdx.x, + blockIdx.x, + my_section_idx, + num_sections_x, + num_sections_y, + section_x, + section_y, + absolute_row, + num_rows, + relative_col, + relative_row); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
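As a concrete illustration of that shuffle, the host-side loop below reproduces what the warp ballot computes: each of 32 lanes holds one row's validity byte (8 columns), and a ballot over bit i yields the 32-row validity word for column i. The test pattern is arbitrary and the code is purely illustrative of the bit transpose, not part of the kernel.

#include <cstdint>
#include <cstdio>

int main()
{
  uint8_t row_validity[32];
  for (int row = 0; row < 32; ++row) {
    row_validity[row] = static_cast<uint8_t>(row * 7 + 3);  // arbitrary test pattern
  }

  for (int col = 0; col < 8; ++col) {
    uint32_t column_word = 0;  // what __ballot_sync(participation_mask, my_byte & byte_mask) yields
    for (int lane = 0; lane < 32; ++lane) {
      // lane r contributes bit r if its row has column `col` set
      if (row_validity[lane] & (1u << col)) { column_word |= (1u << lane); }
    }
    std::printf("column %d -> 0x%08x\n", col, column_word);
  }
  return 0;
}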
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (print_debug) + printf("%d - Writing validity data 0x%x to shared memory location %d\n", + threadIdx.x, + validity_data, + validity_write_offset); + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } } - // auto const dst_bit_mask = 1 << dst_shift; - dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - - for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); - if (debug_print) { - printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); - } - *validity_ptr = dst_validity[i]; + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const starting_address = output_nm[col] + word_index(block_start_row); + + if (print_debug) + printf("memcpy_async(%p(offset %d), %p, %d, subset_barrier);\n", + starting_address, + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + words_to_copy * 4); + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); } } + + // if (print_debug) printf("leaving...\n"); + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED + ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED + : blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } } -} + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -834,8 +1437,8 @@ __global__ void copy_to_columns(const size_type num_rows, static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, const cudf::size_type num_rows, const cudf::size_type size_per_row, - dim3 &blocks, - dim3 &threads) + dim3& blocks, + dim3& threads) { // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. 
@@ -846,7 +1449,7 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; + int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); if (y_block_size > 32) { y_block_size = 32; } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials @@ -895,14 +1498,14 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, - const cudf::scalar &scalar_size_per_row, + rmm::device_uvector& column_start, + rmm::device_uvector& column_size, + rmm::device_uvector& input_data, + rmm::device_uvector& input_nm, + const cudf::scalar& zero, + const cudf::scalar& scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { int64_t total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow @@ -944,12 +1547,12 @@ static std::unique_ptr fixed_width_convert_to_rows( mr); } -static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } +static cudf::data_type get_data_type(const cudf::column_view& v) { return v.type(); } -static inline bool are_all_fixed_width(std::vector const &schema) +static inline bool are_all_fixed_width(std::vector const& schema) { return std::all_of( - schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + schema.begin(), schema.end(), [](const cudf::data_type& t) { return cudf::is_fixed_width(t); }); } /** @@ -959,9 +1562,9 @@ static inline bool are_all_fixed_width(std::vector const &schem * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) +static inline int32_t compute_fixed_width_layout(std::vector const& schema, + std::vector& column_start, + std::vector& column_size) { // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. @@ -979,27 +1582,29 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned return align_offset(at_offset, 8); // 8 bytes (64 bits) } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + template -static size_type compute_column_information( - iterator begin, - iterator end, - std::vector &column_starts, - std::vector &column_sizes)//, - //std::function nested_type_cb) +static size_type compute_column_information(iterator begin, + iterator end, + std::vector& column_starts, + std::vector& column_sizes) //, +// std::function nested_type_cb) { size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; -// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } // a list or string column will write a single uint64 // of data here for offset/length @@ -1021,11 +1626,53 @@ static size_type compute_column_information( //#define DEBUG -static std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) +std::vector build_validity_block_infos( + size_type const& num_columns, + size_type const& num_rows, + size_type const& shmem_limit_per_block, + std::vector const& row_batches) +{ + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; +} + +std::vector build_block_infos(std::vector const& column_sizes, + std::vector const& column_starts, + std::vector const& row_batches, + size_type const total_number_of_rows, + size_type const& shmem_limit_per_block) { std::vector block_infos; @@ -1067,19 +1714,37 @@ static std::vector build_block_infos(std::vector const &c // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. 
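A worked instance of that sizing rule, following the main branch of the window-height formula below; the shared-memory budget, leading column size, and batch row count here are invented numbers, and round_up is a local stand-in for util::round_up_safe.

#include <algorithm>
#include <cmath>
#include <cstdio>

static int round_up(int value, int multiple) { return ((value + multiple - 1) / multiple) * multiple; }

int main()
{
  int const shmem_limit_per_block = 24 * 1024;  // hypothetical per-block staging budget in bytes
  int const leading_col_size      = 4;          // bytes of the first column in this window
  int const batch_row_count       = 1000;       // rows available in the first row batch

  int const optimal_square_len =
    static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_per_block)));
  int const window_height =
    std::clamp(round_up(std::min(optimal_square_len / leading_col_size, batch_row_count), 32),
               1, batch_row_count);

  std::printf("sqrt(%d) ~= %d bytes per side -> window height %d rows\n",
              shmem_limit_per_block, optimal_square_len, window_height);
  return 0;
}

The window is then widened column by column until the padded row size times the window height, plus the staged column size/offset data, would no longer fit in the shared-memory budget, which is what the loop that follows checks.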
- int const window_height = std::min( - std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), - row_batches[0].row_count); + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = + std::clamp(util::round_up_safe( + optimal_square_len <= (size_type)column_sizes.size() + ? std::min(optimal_square_len / column_sizes[0], total_number_of_rows) + : row_batches[0].row_count / 2, + 32), + 1, + row_batches[0].row_count); #if defined(DEBUG) printf( - "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " + "and num_rows is %d, batch row count is %d " + "- which makes window height " + "%d - admin size is %lu\n", + optimal_square_len, + (int)column_sizes.size(), + optimal_square_len / column_sizes[0], total_number_of_rows, row_batches[0].row_count, - window_height); + window_height, + column_sizes.size() * sizeof(size_type) * 2); #endif + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. + return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + int row_size = 0; // march each column and build the blocks of appropriate sizes @@ -1092,14 +1757,26 @@ static std::vector build_block_infos(std::vector const &c auto row_size_with_this_col = row_size_aligned + col_size; auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { #if defined(DEBUG) printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "row size with end pad is %d and admin data is %d, which adds up to %d and that is too " + "large for shmem block of %d\n", + row_size_with_end_pad, + calc_admin_data_size(col - current_window_start_col), + row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col), + shmem_limit_per_block); + printf( + "Window size %d too large at column %d, admin size is %d, bumping back to build windows of " + "size %d(cols " "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", row_size_with_end_pad * window_height, col, + calc_admin_data_size(col - current_window_start_col), row_size * window_height, current_window_start_col, col - 1, @@ -1136,31 +1813,35 @@ static std::vector build_block_infos(std::vector const &c // build last set of blocks if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); } return block_infos; } -} // namespace detail #if defined(DEBUG) - void pretty_print(uint64_t i) { - if (i > (1 * 1024 * 1024 * 1024)) { - printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); - } else if (i > (1 * 1024 * 1024)) { - printf("%.2f MB", i / float(1 * 1024 * 1024)); - } else if (i > (1 * 1024)) { - printf("%.2f KB", float(i / 1024)); - } else { - printf("%lu Bytes", i); - } +void pretty_print(uint64_t i) +{ + if (i > (1 * 1024 * 1024 * 1024)) { + printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); + } else if (i > (1 * 1024 * 1024)) { + printf("%.2f MB", i / float(1 * 1024 * 1024)); + } else if (i > (1 * 1024)) { + printf("%.2f KB", float(i / 1024)); + } else { + printf("%lu Bytes", i); } +} #endif +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::vector> convert_to_rows(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. @@ -1169,9 +1850,13 @@ std::vector> convert_to_rows2(cudf::table_view con int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + +#if defined(DEBUG) || 1 + total_shmem -= 1024; +#endif + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; #if defined(DEBUG) size_t free, total; @@ -1195,8 +1880,8 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. 
// Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); for (size_type column_number = 0; column_number < num_columns; column_number++) { @@ -1224,16 +1909,16 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); + auto iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, - iter + num_columns, - column_starts, - column_sizes);//, -// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + size_type fixed_width_size_per_row = detail::compute_column_information(iter, + iter + num_columns, + column_starts, + column_sizes); //, + // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); /* size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { auto cv = tbl.column(col); @@ -1261,7 +1946,6 @@ std::vector> convert_to_rows2(cudf::table_view con column_starts.back() + column_sizes.back()); #endif - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); @@ -1329,7 +2013,8 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); @@ -1339,17 +2024,17 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); - pretty_print(row_batches[i].num_bytes); + detail::pretty_print(row_batches[i].num_bytes); printf("\n"); } #endif std::vector output_buffers; - std::vector output_data; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i = 0; i < row_batches.size(); ++i) { rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); + output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); @@ -1362,38 +2047,63 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.size(), block_infos[0].end_col - block_infos[0].start_col + 1, block_infos[0].end_row - block_infos[0].start_row); - pretty_print(shmem_limit_per_block); + detail::pretty_print(shmem_limit_per_block); printf(" shared mem("); - pretty_print(fixed_width_size_per_row); + detail::pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); - pretty_print(total_table_size); + detail::pretty_print(total_table_size); printf(" total):\n"); #endif auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); 
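The batching logic above exists because the list offsets of the returned byte columns are 32-bit, so a single batch of rows must stay within what those offsets can address. The host-side sketch below shows the idea in simplified form; the row size and the limit are invented stand-ins, and the patch additionally cuts batches on 32-row boundaries so the validity words line up.

#include <cstdint>
#include <cstdio>
#include <vector>

struct row_batch {
  int64_t num_bytes;
  int32_t row_count;
};

int main()
{
  int32_t const num_rows      = 100;
  int64_t const row_size      = 40;    // hypothetical fixed-width row size in bytes
  int64_t const max_batch_len = 1024;  // tiny stand-in for the 32-bit offset limit

  std::vector<row_batch> row_batches;
  int64_t batch_bytes = 0;
  int32_t batch_rows  = 0;
  for (int32_t row = 0; row < num_rows; ++row) {
    if (batch_bytes + row_size > max_batch_len) {  // this row would overflow the batch: cut here
      row_batches.push_back({batch_bytes, batch_rows});
      batch_bytes = 0;
      batch_rows  = 0;
    }
    batch_bytes += row_size;
    batch_rows++;
  }
  if (batch_rows > 0) { row_batches.push_back({batch_bytes, batch_rows}); }

  for (std::size_t i = 0; i < row_batches.size(); ++i) {
    std::printf("batch %zu: %d rows, %ld bytes\n",
                i, row_batches[i].row_count, static_cast<long>(row_batches[i].num_bytes));
  }
  return 0;
}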
// blast through the entire table and convert it - dim3 blocks(block_infos.size()); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); - #endif + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 threads(256); + #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); + detail::pretty_print(shmem_limit_per_block); printf(" shared memory\n"); #endif - copy_from_columns<<>>( + detail::copy_from_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, + block_infos.size(), dev_input_data.data(), - dev_input_nm.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); +#if defined(DEBUG) + printf("Launching validity kernel with %d blocks, for %lu validity blocks with %d threads, ", + validity_blocks.x, + validity_block_infos.size(), + validity_threads.x); + detail::pretty_print(total_shmem); + printf(" shared memory\n"); +#endif + detail:: + copy_validity_from_columns<<>>( + num_rows, + num_columns, + shmem_limit_per_block, + dev_row_offsets.data(), + dev_output_data.data(), + column_starts.back(), + dev_validity_block_infos.data(), + validity_block_infos.size(), + dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns @@ -1428,11 +2138,15 @@ std::vector> convert_to_rows2(cudf::table_view con } return ret; +#else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::vector> old_convert_to_rows(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const cudf::size_type num_columns = tbl.num_columns(); @@ -1456,8 +2170,8 @@ std::vector> convert_to_rows(cudf::table_view cons cudf::size_type num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { cudf::column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); @@ -1469,11 +2183,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); + static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); 
step->set_valid_async(true, stream); - static_cast(step.get()) + static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); std::vector> ret; @@ -1500,11 +2214,12 @@ std::vector> convert_to_rows(cudf::table_view cons } } -std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::unique_ptr convert_from_rows(cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); @@ -1516,11 +2231,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; +#if defined(DEBUG) || 1 + total_shmem -= 1024; +#endif + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1529,7 +2246,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i return std::make_tuple(schema[i], nullptr); }); size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); size_type validity_size = num_bitmask_words(num_columns) * 4; @@ -1537,8 +2254,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // Ideally we would check that the offsets are all the same, etc. 
but for now // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), - "The layout of the data appears to be off"); + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); @@ -1549,8 +2265,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (cudf::size_type i = 0; i < num_columns; i++) { auto column = cudf::make_fixed_width_column( schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); @@ -1568,36 +2284,97 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); - #endif + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); +#else + dim3 threads(std::min(256, (int)child.size())); +#endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); + detail::pretty_print(total_shmem); printf(" shared memory\n"); #endif - detail::copy_to_columns<<>>( + detail::copy_to_columns<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), - dev_output_nm.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, + row, + std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + printf( + "Launching validity kernel with %d blocks, for %lu validity blocks, col stride %d and row " + "stride of %d with %d threads, ", + validity_blocks.x, + 
validity_block_infos.size(), + column_stride, + row_stride, + threads.x); + detail::pretty_print(total_shmem); + printf(" shared memory\n"); +#endif + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_to_columns<<>>( + num_rows, + num_columns, + shmem_limit_per_block, + input.offsets().data(), + dev_output_nm.data(), + column_starts.back(), + dev_validity_block_infos.data(), + validity_block_infos.size(), + child.data()); + return std::make_unique(std::move(output_columns)); +#else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::unique_ptr old_convert_from_rows(cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // verify that the types are what we expect cudf::column_view child = input.child(); @@ -1619,12 +2396,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (cudf::size_type i = 0; i < num_columns; i++) { auto column = cudf::make_fixed_width_column( schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); @@ -1642,6 +2419,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int shared_size = detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + // printf("Launching (%d, %d, %d) blocks, (%d, %d, %d) threads, with %d shared size\n", + // blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shared_size); + // printf("pointers are column_start: %p, column_size: %p, output_data: %p, output_nm: %p\n", + // dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), + // dev_output_nm.data()); detail::copy_to_fixed_width_columns<<>>( num_rows, num_columns, @@ -1658,36 +2440,4 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } } -std::unique_ptr convert_from_rows( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows(lcv, schema, stream, mr); - - return ret; - // } -} - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows2(lcv, schema, stream, mr); - - return ret; - // } -} - } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 818d7a89ddb..e38b37e81a6 
100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -34,8 +34,8 @@ TEST_F(ColumnToRowTests, Single) cudf::test::fixed_width_column_wrapper a({-1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -48,8 +48,8 @@ TEST_F(ColumnToRowTests, Simple) cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -64,8 +64,8 @@ TEST_F(ColumnToRowTests, Tall) cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -84,8 +84,8 @@ TEST_F(ColumnToRowTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -104,8 +104,31 @@ TEST_F(ColumnToRowTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -120,9 +143,9 @@ TEST_F(RowToColumnTests, Single) auto old_rows = cudf::convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i views; for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + 
cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); views.push_back(cols.back()); } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + cols.push_back(cudf::test::fixed_width_column_wrapper(r, r + 13)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + for (int i = 0; i < 256; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 68f1ae93dec..1babbc6fd1a 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -14,36 +14,52 @@ * limitations under the License. */ +#include #include +#include #include +#include + +#include +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#include +#endif #include #include +#include +#include +#include +#include #include #include +#include #include #include #include #include +#include #include - -#include "row_conversion.hpp" - +#include +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#endif + +using cudf::detail::make_device_uvector_async; namespace cudf { -namespace java { -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync(ret->data(), input.data(), sizeof(T) * input.size(), - cudaMemcpyHostToDevice, stream.value())); - return ret; +namespace detail { + +static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); } __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, @@ -53,7 +69,6 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, const int8_t *input_data) { - // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -122,7 +137,6 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, cudf::size_type col_index_stride = blockDim.y; for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); int8_t *col_output = output_data[col_index]; @@ -208,7 +222,6 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a // requirement for launching the kernel cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; @@ -220,7 +233,6 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ cudf::size_type col_index_stride = blockDim.y; for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); const int8_t *col_input = input_data[col_index]; @@ -304,6 +316,630 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; + + __host__ __device__ size_type get_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } +}; + +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. 
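(As a rough illustration of where that cap bites, using a hypothetical 24-byte row: a single batch can hold roughly 2^31 / 24, about 89.5 million rows, before the 32-bit offsets of the output lists column would overflow, and the cut is then pulled back to the previous 32-row boundary.)

    // a minimal check of that arithmetic; the 24-byte row is only an example
    constexpr long long max_batch_rows(long long row_bytes) {
      return (2147483647LL / row_bytes) & ~31LL;  // int32_t offset cap, cut back to a 32-row boundary
    }
    static_assert(max_batch_rows(24) == 89478464, "roughly 89.5 million 24-byte rows per batch");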
Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. +struct row_batch { + size_type num_bytes; + size_type row_count; +}; + +/** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ +__global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type num_block_infos, const int8_t **input_data, + const size_type *col_sizes, const size_type *col_offsets, + const block_info *block_infos, const size_type *row_offsets, + int8_t **output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto const blocks_remaining = + std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + + auto const num_fetch_cols = fetch_block.num_cols(); + auto const num_fetch_rows = fetch_block.num_rows(); + auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_block.start_col]; + auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // wait for the last use of the memory to be completed + if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. 
When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it more + // appealing to copy element-wise from input data into shared matching the end layout and do + // row-based memcopies out. + + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + // copy the main + cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, + fetch_barrier); + } + } + + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + /* auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = + output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + auto const shared_offset = block_row_size * relative_row; + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, + subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_from_columns( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single int32 of validity - so we read 128 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 31) / 32; + auto const num_sections_y = (num_block_rows + 7) / 8; + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx / num_sections_x; + auto const section_y = my_section_idx % num_sections_x; + + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 8; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (absolute_col < num_columns) { + auto my_byte = + input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // the bytes we actually write. 
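(A stripped-down sketch of that ballot step, separate from this kernel and with hypothetical names: every lane of the warp contributes one predicate, __ballot_sync packs the 32 answers into a single register, and one lane stores the packed word.)

    #include <cstdint>

    __global__ void pack_flags_with_ballot(uint8_t const *flags, uint32_t *packed_bits) {
      // assumes a single block whose size is a multiple of 32, and one flag per thread
      auto const lane = threadIdx.x % 32;
      auto const warp = threadIdx.x / 32;
      uint32_t const word = __ballot_sync(0xFFFFFFFF, flags[threadIdx.x] != 0);
      // bit `lane` of `word` holds lane `lane`'s flag, so one store covers 32 bits at once
      if (lane == 0) { packed_bits[warp] = word; }
    }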
+ for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (cols_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + auto const output_ptr = + output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( + output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + for (int validity_block = 0; + validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } +} + +static __device__ std::tuple +get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; +} + +/** + * @brief ensure `read_ahead` buffer blocks are fetched + * + * @param fetch_index internal state passed into the function + * @param processing_index index where processing is occuring + * @param read_ahead_count how many blocks to read ahead + * @param max_resident_blocks how many blocks can be loaded at once + * @param total_blocks total number of blocks overall + * @param block_infos pointer to the block infos + * @param col_sizes pointer to column size information + * @param col_offsets pointer to the table's column offsets + * @param row_offsets pointer to offsets for each row in the table + * @param input_data pointer to the input data + * @param shared pointer to shared memory + * @param group thread group participating in the fetch + * @param block_barrier barriers used for each block + * @return + */ +static __device__ void +fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_index, + int const read_ahead_count, int const max_resident_blocks, + int const total_blocks, block_info const *const block_infos, + size_type const *const col_sizes, size_type const *const col_offsets, + size_type const *const row_offsets, int8_t const *const input_data, + int8_t *shared[], cooperative_groups::thread_block const group, + cuda::barrier *block_barrier) { + for (; fetch_index < static_cast(total_blocks) && + fetch_index < 
(processing_index + read_ahead_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, + fetch_barrier); + } + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param row_offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. 
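(The same fetch/process shape, stripped down to a plain byte copy with hypothetical names: one shared-memory buffer is filled by cuda::memcpy_async while the other is consumed, with a cuda::barrier per buffer signalling when its copy has landed. Like the kernels here, this needs compute capability 7.0 or later.)

    #include <cstdint>
    #include <cooperative_groups.h>
    #include <cuda/barrier>

    __global__ void double_buffered_copy(int8_t const *in, int8_t *out, int tile_bytes, int num_tiles) {
      extern __shared__ int8_t smem[];  // launch with 2 * tile_bytes of dynamic shared memory
      int8_t *buf[2] = {smem, smem + tile_bytes};
      __shared__ cuda::barrier<cuda::thread_scope_block> ready[2];
      auto block = cooperative_groups::this_thread_block();
      if (block.thread_rank() == 0) {
        init(&ready[0], block.size());
        init(&ready[1], block.size());
      }
      block.sync();

      int fetch = 0;
      for (int tile = 0; tile < num_tiles; ++tile) {
        // keep up to two tiles in flight
        for (; fetch < num_tiles && fetch < tile + 2; ++fetch) {
          cuda::memcpy_async(block, buf[fetch % 2], in + fetch * tile_bytes, tile_bytes,
                             ready[fetch % 2]);
        }
        ready[tile % 2].arrive_and_wait();  // block until this tile's bytes are in shared memory
        for (int i = block.thread_rank(); i < tile_bytes; i += block.size()) {
          out[tile * tile_bytes + i] = buf[tile % 2][i];  // stand-in for the real per-element work
        }
        block.sync();  // everyone is done reading this buffer before it gets refilled
      }
    }

The conversion kernels in this file follow the same shape, just with block_info-sized tiles and per-element or per-row copies instead of one flat blob.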
+ + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[stages_count]; + if (group.thread_rank() == 0) { + for (int i = 0; i < stages_count; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); + + auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), + col_offset_size = sizeof(decltype(*_col_offsets))]( + int const num_cols, + int const num_rows) -> std::tuple { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; + }; + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + fetch_blocks_for_row_to_column(fetch, subset, stages_count, stages_count, blocks_remaining, + block_infos, _col_sizes, _col_offsets, row_offsets, input_data, + shared, group, block_barrier); + + auto &subset_barrier = block_barrier[subset % stages_count]; + // ensure our data is ready + subset_barrier.arrive_and_wait(); + + auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); + // auto shared_row_offsets = shared[subset]; + auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_row_size(_col_offsets, _col_sizes); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
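(Concretely, the flat index walks the tile row-major, so for a hypothetical 3-column tile index 0..2 is row 0, index 3..5 is row 1, and each thread strides that range by blockDim.x; a trivial host-side check of the mapping:)

    #include <cassert>

    int main() {
      int const cols_in_block = 3, rows_in_block = 4;
      for (int index = 0; index < rows_in_block * cols_in_block; ++index) {
        int const relative_col = index % cols_in_block;
        int const relative_row = index / cols_in_block;
        assert(relative_row * cols_in_block + relative_col == index);  // every element hit exactly once
      }
      return 0;
    }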
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + auto const column_size = shared_col_sizes[relative_col]; + + int8_t *shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); + } + group.sync(); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_to_columns( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single byte of validity - so we read 32 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
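      // Put differently, and assuming the validity windows start on an 8-column boundary as the
      // ones built for this kernel do: the byte read above holds the validity bits for columns
      // absolute_col .. absolute_col + 7 of this row, bit 0 being absolute_col. The loop below
      // peels those bits off with one __ballot_sync per column, which gathers that column's bit
      // from the 32 rows held by the warp's lanes.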
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? + NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : + blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } +} + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -317,7 +953,6 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, const cudf::size_type num_rows, const cudf::size_type size_per_row, dim3 &blocks, dim3 &threads) { - // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -327,10 +962,9 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; - if (y_block_size > 32) { + int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); + if (y_block_size > 32) y_block_size = 32; - } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -373,15 +1007,15 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, * going from start row and containing the next num_rows. Most of the parameters passed * into this function are common between runs and should be calculated once. 
*/ -static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +static std::unique_ptr +fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { int64_t total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); @@ -397,30 +1031,23 @@ static std::unique_ptr fixed_width_convert_to_rows( dim3 blocks; dim3 threads; int shared_size = - calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); copy_from_fixed_width_columns<<>>( - start_row, num_rows, num_columns, size_per_row, column_start->data(), column_size->data(), - input_data->data(), input_nm->data(), data->mutable_view().data()); + start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), + input_data.data(), input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{}, stream, mr); + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } -static bool is_fixed_width(const cudf::data_type &t) { - return cudf::is_fixed_width(t); -} - -static inline int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); -} - static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), cudf::java::is_fixed_width); + return std::all_of(schema.begin(), schema.end(), + [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); } /** @@ -449,30 +1076,443 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. 
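(A small standalone re-derivation of this layout arithmetic, for a hypothetical three-column schema; align_offset is the same power-of-two rounding helper defined earlier in this file:)

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static int32_t align_offset(int32_t offset, std::size_t alignment) {
      return (offset + alignment - 1) & ~(alignment - 1);  // alignment must be a power of two
    }

    int main() {
      std::vector<int32_t> sizes{1, 4, 8};  // e.g. INT8, INT32, INT64
      std::vector<int32_t> starts;
      int32_t at_offset = 0;
      for (auto size : sizes) {
        at_offset = align_offset(at_offset, size);  // each value is naturally aligned in the row
        starts.push_back(at_offset);
        at_offset += size;
      }
      at_offset += (int32_t(sizes.size()) + 7) / 8;        // one validity bit per column, byte aligned
      int32_t const row_size = align_offset(at_offset, 8); // rows are 8-byte aligned
      assert(starts[0] == 0 && starts[1] == 4 && starts[2] == 8);
      assert(row_size == 24);  // 16 bytes of data, 1 validity byte, 7 bytes of padding
      return 0;
    }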
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned return align_offset(at_offset, 8); // 8 bytes (64 bits) } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +template +static size_type compute_column_information(iterator begin, iterator end, + std::vector &column_starts, + std::vector &column_sizes) //, +// std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} + +std::vector +build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_block, + std::vector const &row_batches) { + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; +} + +std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + 
while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, current_window_start_row, end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = + std::clamp(util::round_up_safe( + optimal_square_len <= (size_type)column_sizes.size() ? + std::min(optimal_square_len / column_sizes[0], total_number_of_rows) : + row_batches[0].row_count / 2, + 32), + 1, row_batches[0].row_count); + + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. + return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + } + + return block_infos; +} + +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail + std::vector> convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the 
writes can coalese as well. + // Potential optimization for window sizes. + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: kernels fail to launch if we use all the available shared memory. + total_shmem -= 1024; + + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, + column_sizes); //, + // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { 
variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ + + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + + std::vector row_batches; + + auto calculate_variable_width_row_data_size = [](int const row) { + // each level of variable-width data will add an offset/length + // uint64 of data. The first of which is inside the fixed-width + // data itself and needs to be aligned based on what is around + // that data. This is handled above with the fixed-width calculations + // for that reason. We may still need to add more of these offset/length + // combinations if the nesting is deeper than one level as these + // will be included in the variable-width data blob at the end of the + // row. + return 0; + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ + }; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. 
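(A worked instance of the loop below, assuming three fixed-width columns whose data takes 16 bytes per row: the validity for 3 columns is one 32-bit bitmask word, so each row occupies align_offset(16 + 4, 8) = 24 bytes once padded to the 8-byte row alignment.)

    // a minimal check of that arithmetic; align8 mirrors detail::align_offset with an alignment of 8
    constexpr int align8(int v) { return (v + 7) & ~7; }
    static_assert(align8(16 + 4) == 24, "16 data bytes + one 4-byte validity word, padded to 8 bytes");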
+ auto validity_size = num_bitmask_words(num_columns) * 4; + for (int row = 0; row < num_rows; ++row) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += validity_size; + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + } + + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + // blast through the entire table and convert it + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 threads(256); + + detail::copy_from_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail::copy_validity_from_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), + column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), + dev_input_nm.data()); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i = 0; i < row_batches.size(); ++i) { + // compute offsets for this row batch + std::vector offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row = 0; row < 
row_batches[i].row_count; ++row) { + cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + offset_vals.push_back(cur_offset); + } + offset_offset += row_batches[i].row_count; + + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); + auto offsets = std::make_unique(data_type{type_id::INT32}, + (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, + std::move(output_buffers[i])); + + ret.push_back( + cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + } + + return ret; +#else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +} + +std::vector> +old_convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { const cudf::size_type num_columns = tbl.num_columns(); std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), cudf::java::get_data_type); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - if (are_all_fixed_width(schema)) { + if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -489,8 +1529,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -506,7 +1546,7 @@ std::vector> convert_to_rows(cudf::table_view cons for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { cudf::size_type row_count = num_rows - row_start; row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(fixed_width_convert_to_rows( + ret.emplace_back(detail::fixed_width_convert_to_rows( row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, dev_input_data, dev_input_nm, *zero, *step, stream, mr)); } @@ -521,7 +1561,129 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in std::vector const &schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: unable to launch a kernel with all shared used + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + + size_type validity_size = num_bitmask_words(num_columns) * 4; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); +#else + dim3 threads(std::min(256, (int)child.size())); +#endif + detail::copy_to_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_to_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), + validity_block_infos.size(), child.data()); + + return std::make_unique(std::move(output_columns)); +#else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; +#endif // 
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +} +std::unique_ptr old_convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); @@ -530,19 +1692,19 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in cudf::size_type num_columns = schema.size(); - if (are_all_fixed_width(schema)) { + if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -557,17 +1719,17 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; int shared_size = - calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - copy_to_fixed_width_columns<<>>( - num_rows, num_columns, size_per_row, dev_column_start->data(), dev_column_size->data(), - dev_output_data->data(), dev_output_nm->data(), child.data()); + detail::copy_to_fixed_width_columns<<>>( + num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), + dev_output_data.data(), dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); } else { @@ -575,5 +1737,4 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } } -} // namespace java } // namespace cudf diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 17abde8df19..517202f3892 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -25,12 +25,24 @@ namespace cudf { namespace java { +std::vector> +old_convert_to_rows(cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + std::vector> convert_to_rows(cudf::table_view const &tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr +old_convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + 
rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, From 2a57ce67cc4fa7bc7ae436756f2ab7a5d0eb2cab Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 21 Sep 2021 21:39:00 +0000 Subject: [PATCH 16/80] fixing validity alignment bugs --- cpp/src/row_conversion/row_conversion.cu | 144 +++++++++---- cpp/tests/row_conversion/row_conversion.cpp | 226 +++++++++++++++++++- java/src/main/native/src/row_conversion.cu | 22 +- 3 files changed, 333 insertions(+), 59 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 42c40e0542d..0409a65b630 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -493,7 +493,7 @@ __global__ void copy_from_columns(const size_type num_rows, input_src, col_size); - // copy the main + // copy the element to global memory cuda::memcpy_async( &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -568,7 +568,11 @@ __global__ void copy_validity_from_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - constexpr bool print_debug = false; //(threadIdx.x==0 || threadIdx.x == 32) && blockIdx.x == 0; + int8_t* output_check_addr = nullptr; + int8_t* output_block_start = nullptr; + size_type output_block_size = 0; + + bool print_debug = false; //threadIdx.x==0 && blockIdx.x == 0; // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; if (print_debug) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -659,12 +663,14 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, warp size " "%d\n", threadIdx.x, blockIdx.x, warp_id, total_sections, + num_sections_x, + num_sections_y, warps_per_block, blockDim.x, detail::warp_size); @@ -672,10 +678,10 @@ __global__ void copy_validity_from_columns(const size_type num_rows, for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { // convert to rows and cols - auto const section_x = my_section_idx / num_sections_x; - auto const section_y = my_section_idx % num_sections_x; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; - if (print_debug) printf("working on section %d of %d...\n", section_x, num_sections_x); + if (print_debug) printf("working on section %d,%d - %d of %d...\n", section_x, section_y, my_section_idx, total_sections); auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; @@ -722,7 +728,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_col); // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make // the bytes we actually write. 
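              // Sketch of the bit transpose below (assuming a full warp of 32 participating lanes):
              // lane k holds one validity bit per row for its own column; on iteration i,
              // __ballot_sync gathers bit i from every lane into a single 32-bit word whose bit k
              // comes from lane k, i.e. the row-major validity word for row (relative_row + i)
              // that is then written out to shared memory.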
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); @@ -744,23 +750,23 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (cols_left <= 8) { // write byte if (print_debug) - printf("writing single byte to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); this_shared_block[validity_write_offset] = validity_data & 0xFF; } else if (cols_left <= 16) { // write int16 if (print_debug) - printf("writing two bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; } else if (cols_left <= 24) { // write int16 and then int8 if (print_debug) - printf("writing three bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -768,8 +774,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, } else { // write int32 if (print_debug) - printf("writing 4 bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; @@ -816,6 +822,18 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const output_ptr = output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + +/* if (num_rows >= 5006) { + auto const row5006_col_65 = output_data[block.buffer_num] + row_offsets[5006] + validity_offset + 65 / 8; + if (output_ptr >= row5006_col_65 && output_ptr <= row5006_col_65 + 4) { + printf("%d %d - writing bytes from %p(0x%x)-%p to %p-%p that overlap global %p(0x%x), which is row 5006, col 65!\n", threadIdx.x, blockIdx.x, &this_shared_block[validity_data_row_length * relative_row], this_shared_block[validity_data_row_length * relative_row], &this_shared_block[validity_data_row_length * relative_row + num_bytes], output_ptr, output_ptr + num_bytes, row5006_col_65, *row5006_col_65); + printf("%d %d - block information\n%d,%d -> %d,%d\n%d columns, %d rows\n", threadIdx.x, blockIdx.x, block.start_col, block.start_row, block.end_col, block.end_row, block.num_cols(), block.num_rows()); + output_check_addr = row5006_col_65; + output_block_start = output_ptr; + output_block_size = num_bytes; + } + }*/ + cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], @@ -851,6 +869,17 @@ __global__ void copy_validity_from_columns(const size_type num_rows, ++validity_block) { shared_block_barriers[validity_block].arrive_and_wait(); } + if (output_check_addr != nullptr) { + printf("output check after write to %p - 0x%x\n", output_check_addr, 
*output_check_addr); + for (int i=0; i get_admin_data_sizes(size_t col_size_size, @@ -901,12 +930,12 @@ static __device__ void fetch_blocks_for_row_to_column( for (; fetch_index < static_cast(total_blocks) && fetch_index < (processing_index + read_ahead_count); ++fetch_index) { - if (debug_print) - printf("fetching block %lu of %d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks); auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + if (debug_print) + printf("fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending offset %p\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks, fetch_block.start_col, fetch_block.end_col, &col_offsets[fetch_block.start_col], &col_offsets[fetch_block.end_col]); auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; @@ -948,7 +977,7 @@ static __device__ void fetch_blocks_for_row_to_column( &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes); - cuda::memcpy_async(group, + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes, @@ -983,7 +1012,7 @@ static __device__ void fetch_blocks_for_row_to_column( fetch_index % max_resident_blocks, &shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset]); - // copy the main + // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, @@ -1029,7 +1058,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; // threadIdx.x == 0; + /*constexpr*/ bool debug_print = false; //threadIdx.x == 0 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1037,12 +1066,14 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + printf("%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i build_validity_block_infos( }(), 8); // we fit as much as we can given the column stride - auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! 
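  // Illustrative numbers only: with column_stride = 64, a row needs ceil(64 / 8) = 8 bytes of
  // validity, already 8-byte aligned, so a hypothetical 24 KiB shared-memory budget would allow
  // up to 24 * 1024 / 8 = 3072 rows per block before the std::min with num_rows below.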
+ auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1695,6 +1747,7 @@ std::vector build_block_infos(std::vector const& column_s } int const window_height = std::min(desired_window_height, rows_left_in_batch); +// printf("block %d, %d to %d, %d\n", start_col, current_window_start_row, end_col, std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1)); block_infos.emplace_back(detail::block_info{ start_col, current_window_start_row, @@ -1716,11 +1769,7 @@ std::vector build_block_infos(std::vector const& column_s // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = - std::clamp(util::round_up_safe( - optimal_square_len <= (size_type)column_sizes.size() - ? std::min(optimal_square_len / column_sizes[0], total_number_of_rows) - : row_batches[0].row_count / 2, - 32), + std::clamp(util::round_up_safe(std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], total_number_of_rows), 32), 1, row_batches[0].row_count); #if defined(DEBUG) @@ -1787,7 +1836,7 @@ std::vector build_block_infos(std::vector const& column_s shmem_limit_per_block); #endif // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); #if defined(DEBUG) @@ -1973,6 +2022,16 @@ std::vector> convert_to_rows(cudf::table_view cons } c = c.child(1); } + exclusive_scan([t](int row_index) { + size_type total_row_size = 0; + for (int i=0 i> convert_to_rows(cudf::table_view cons // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -2310,8 +2370,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); auto const column_stride = [&]() { if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); } else { return util::round_down_safe(desired_rows_and_columns, 8); } @@ -2325,6 +2385,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in return util::round_down_safe(desired_rows_and_columns, 32); }*/ }(); + printf("column stride is %d and row stride is %d. 
std::min(%d, util::round_down_safe(%d * 8 / %d, 32))\n", column_stride, row_stride, num_rows, shmem_limit_per_block, column_stride); + printf("each block uses %d bytes of shared memory\n", (column_stride / 8) * detail::align_offset(row_stride, 4)); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { for (int row = 0; row < num_rows; row += row_stride) { diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index e38b37e81a6..26e071eef79 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -33,11 +33,19 @@ TEST_F(ColumnToRowTests, Single) { cudf::test::fixed_width_column_wrapper a({-1}); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -47,11 +55,19 @@ TEST_F(ColumnToRowTests, Simple) { cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -63,11 +79,20 @@ TEST_F(ColumnToRowTests, Tall) cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -77,10 +102,12 @@ TEST_F(ColumnToRowTests, Wide) { std::vector> cols; std::vector views; + std::vector schema; for (int i = 0; i < 256; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); } cudf::table_view in(views); @@ -88,6 +115,13 @@ TEST_F(ColumnToRowTests, Wide) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = 
cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -97,10 +131,13 @@ TEST_F(ColumnToRowTests, SingleByteWide) { std::vector> cols; std::vector views; + std::vector schema; for (int i = 0; i < 256; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); views.push_back(cols.back()); + + schema.push_back(cudf::data_type{cudf::type_id::INT8}); } cudf::table_view in(views); @@ -108,6 +145,59 @@ TEST_F(ColumnToRowTests, SingleByteWide) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Non2Power) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j=0; jnum_columns(); ++j) { + printf("testing column %d\n", j); + if (j==65) { + printf("old\n"); + cudf::test::print(old_tbl->get_column(j)); + printf("new\n"); + cudf::test::print(new_tbl->get_column(j)); + } + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -119,11 +209,69 @@ TEST_F(ColumnToRowTests, Big) cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); std::vector> cols; std::vector views; + std::vector schema; - for (int i = 0; i < 256; ++i) { + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Bigger) +{ + auto r = + 
cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { cols.push_back( - cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 2 million rows + constexpr auto num_rows = 2 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); } cudf::table_view in(views); @@ -238,7 +386,7 @@ TEST_F(RowToColumnTests, SingleByteWide) } } -TEST_F(RowToColumnTests, non2power) +TEST_F(RowToColumnTests, Non2Power) { auto r = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); @@ -246,9 +394,13 @@ TEST_F(RowToColumnTests, non2power) std::vector views; std::vector schema; - cols.push_back(cudf::test::fixed_width_column_wrapper(r, r + 13)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } cudf::table_view in(views); auto old_rows = cudf::old_convert_to_rows(in); @@ -269,9 +421,67 @@ TEST_F(RowToColumnTests, Big) std::vector views; std::vector schema; - for (int i = 0; i < 256; ++i) { + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Bigger) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + 
schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 5 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { cols.push_back( - cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 1babbc6fd1a..9f0df3569a7 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -54,7 +54,9 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; -namespace cudf { +using cudf::detail::warp_size; + +namespace cudf::java { namespace detail { @@ -526,9 +528,9 @@ __global__ void copy_validity_from_columns( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -557,7 +559,7 @@ __global__ void copy_validity_from_columns( // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -855,12 +857,12 @@ __global__ void copy_validity_to_columns( auto const num_sections_x = (num_block_cols + 7) / 8; auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. 
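    // Illustrative example: a block spanning 20 columns and 100 rows splits into
    // ceil(20 / 8) = 3 column sections and ceil(100 / 32) = 4 row sections, 12 sections total;
    // warp w then handles sections w, w + warps_per_block, w + 2 * warps_per_block, ...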
for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -888,7 +890,7 @@ __global__ void copy_validity_to_columns( ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; From ed5492eb80979d6f90ab18f64d6baf5006abf6a6 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 22 Sep 2021 03:11:58 +0000 Subject: [PATCH 17/80] Updates and bug fixes --- .../row_conversion/row_conversion.cpp | 2 +- cpp/src/row_conversion/row_conversion.cu | 206 +++++++----------- cpp/tests/row_conversion/row_conversion.cpp | 36 +-- java/src/main/native/src/row_conversion.cu | 106 ++++----- 4 files changed, 155 insertions(+), 195 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index ad9925e9043..2fe436a22c1 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include class RowConversion : public cudf::benchmark { diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0409a65b630..eb3c4b28b6a 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -568,11 +568,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - int8_t* output_check_addr = nullptr; - int8_t* output_block_start = nullptr; - size_type output_block_size = 0; - - bool print_debug = false; //threadIdx.x==0 && blockIdx.x == 0; + constexpr bool print_debug = false; // threadIdx.x==0 && blockIdx.x == 0; // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; if (print_debug) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -663,7 +659,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, warp size " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, " + "warp size " "%d\n", threadIdx.x, blockIdx.x, @@ -681,7 +678,12 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - if (print_debug) printf("working on section %d,%d - %d of %d...\n", section_x, section_y, my_section_idx, total_sections); + if (print_debug) + printf("working on section %d,%d - %d of %d...\n", + section_x, + section_y, + my_section_idx, + total_sections); auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; @@ -751,14 +753,18 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write byte if (print_debug) printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); this_shared_block[validity_write_offset] = validity_data & 0xFF; } else if 
(cols_left <= 16) { // write int16 if (print_debug) printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -766,7 +772,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write int16 and then int8 if (print_debug) printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -775,7 +783,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write int32 if (print_debug) printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; @@ -823,63 +833,20 @@ __global__ void copy_validity_from_columns(const size_type num_rows, output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); -/* if (num_rows >= 5006) { - auto const row5006_col_65 = output_data[block.buffer_num] + row_offsets[5006] + validity_offset + 65 / 8; - if (output_ptr >= row5006_col_65 && output_ptr <= row5006_col_65 + 4) { - printf("%d %d - writing bytes from %p(0x%x)-%p to %p-%p that overlap global %p(0x%x), which is row 5006, col 65!\n", threadIdx.x, blockIdx.x, &this_shared_block[validity_data_row_length * relative_row], this_shared_block[validity_data_row_length * relative_row], &this_shared_block[validity_data_row_length * relative_row + num_bytes], output_ptr, output_ptr + num_bytes, row5006_col_65, *row5006_col_65); - printf("%d %d - block information\n%d,%d -> %d,%d\n%d columns, %d rows\n", threadIdx.x, blockIdx.x, block.start_col, block.start_row, block.end_col, block.end_row, block.num_cols(), block.num_rows()); - output_check_addr = row5006_col_65; - output_block_start = output_ptr; - output_block_size = num_bytes; - } - }*/ - cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - - /* auto const padding_ptr = output_ptr + num_bytes; - auto const padding_needed = -reinterpret_cast(padding_ptr) & 7; - if (print_debug) printf( - "absolute_row: %d, row_offset for this row: 0x%x, validity data bytes: %d, end - address: %p, padding bytes %lu\n", row, row_offsets[row], num_bytes, output_ptr + - num_bytes, padding_needed); cuda::memcpy_async(padding_ptr, zero, padding_needed, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - */ - - /* if (print_debug) { - for (int i=0; i get_admin_data_sizes(size_t col_size_size, @@ -932,10 +899,16 @@ static __device__ void fetch_blocks_for_row_to_column( ++fetch_index) { auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - if (debug_print) - printf("fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, ending offset %p\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks, fetch_block.start_col, fetch_block.end_col, &col_offsets[fetch_block.start_col], &col_offsets[fetch_block.end_col]); + if (debug_print) + printf( + "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending " + "offset %p\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks, + fetch_block.start_col, + fetch_block.end_col, + &col_offsets[fetch_block.start_col], + &col_offsets[fetch_block.end_col]); auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; @@ -977,7 +950,7 @@ static __device__ void fetch_blocks_for_row_to_column( &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes); - cuda::memcpy_async(group, + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes, @@ -985,23 +958,6 @@ static __device__ void fetch_blocks_for_row_to_column( shared_row_offset += col_offset_bytes; shared_row_offset = align_offset(shared_row_offset, 8); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0 && fetch_block.start_col == 0 && - fetch_block.start_row <= 51 && fetch_block.end_row >= 51) { - printf("Input data for col 0 row 51 is 0x"); - for (int i = 0; i < col_sizes[0]; ++i) { - printf("%x ", input_data[row_offsets[51] + col_offsets[0] + i]); - } - printf("\n"); - printf( - "this is at offset %d-%d and starting column offset is %d and we're reading %d bytes\n", - col_offsets[0], - col_offsets[0] + col_sizes[0], - starting_col_offset, - fetch_block_row_size); - auto shared_offset = (51 - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - printf("destination is %p", &shared[fetch_index % max_resident_blocks][shared_offset]); - } - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); row <= fetch_block_end_row; row += blockDim.x) { @@ -1012,7 +968,7 @@ static __device__ void fetch_blocks_for_row_to_column( fetch_index % max_resident_blocks, &shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset]); - // copy the main + // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, @@ -1058,7 +1014,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; //threadIdx.x == 0 && blockIdx.x == 0; + /*constexpr*/ bool debug_print = false; // threadIdx.x == 0 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1066,14 +1022,17 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); + printf( + "%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; 
i NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED @@ -1696,7 +1650,7 @@ std::vector build_validity_block_infos( // note that an element in the table takes just 1 bit, but a row with a single // element still takes 8 bytes! auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1747,7 +1701,6 @@ std::vector build_block_infos(std::vector const& column_s } int const window_height = std::min(desired_window_height, rows_left_in_batch); -// printf("block %d, %d to %d, %d\n", start_col, current_window_start_row, end_col, std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1)); block_infos.emplace_back(detail::block_info{ start_col, current_window_start_row, @@ -1768,10 +1721,13 @@ std::vector build_block_infos(std::vector const& column_s // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = - std::clamp(util::round_up_safe(std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], total_number_of_rows), 32), - 1, - row_batches[0].row_count); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, + row_batches[0].row_count); #if defined(DEBUG) printf( "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " @@ -2385,8 +2341,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in return util::round_down_safe(desired_rows_and_columns, 32); }*/ }(); - printf("column stride is %d and row stride is %d. 
std::min(%d, util::round_down_safe(%d * 8 / %d, 32))\n", column_stride, row_stride, num_rows, shmem_limit_per_block, column_stride); - printf("each block uses %d bytes of shared memory\n", (column_stride / 8) * detail::align_offset(row_stride, 4)); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { for (int row = 0; row < num_rows; row += row_stride) { diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 26e071eef79..70a4552a6f9 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -168,8 +168,8 @@ TEST_F(ColumnToRowTests, Non2Power) constexpr auto num_rows = 6 * 1024 + 557; for (int i = 0; i < 131; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -184,9 +184,9 @@ TEST_F(ColumnToRowTests, Non2Power) auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - for (int j=0; jnum_columns(); ++j) { + for (int j = 0; j < old_tbl->num_columns(); ++j) { printf("testing column %d\n", j); - if (j==65) { + if (j == 65) { printf("old\n"); cudf::test::print(old_tbl->get_column(j)); printf("new\n"); @@ -214,8 +214,8 @@ TEST_F(ColumnToRowTests, Big) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 28; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -241,8 +241,8 @@ TEST_F(ColumnToRowTests, Bigger) // 128 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -268,8 +268,8 @@ TEST_F(ColumnToRowTests, Biggest) // 128 columns of 2 million rows constexpr auto num_rows = 2 * 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -396,8 +396,8 @@ TEST_F(RowToColumnTests, Non2Power) constexpr auto num_rows = 6 * 1024 + 557; for (int i = 0; i < 131; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -424,8 +424,8 @@ TEST_F(RowToColumnTests, Big) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 28; ++i) { - cols.push_back( - 
cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -452,8 +452,8 @@ TEST_F(RowToColumnTests, Bigger) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -480,8 +480,8 @@ TEST_F(RowToColumnTests, Biggest) // 28 columns of 1 million rows constexpr auto num_rows = 5 * 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 9f0df3569a7..c64a61b3373 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -54,9 +54,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; -using cudf::detail::warp_size; - -namespace cudf::java { +namespace cudf { namespace detail { @@ -403,7 +401,6 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -435,7 +432,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; - // copy the main + // copy the element to global memory cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -445,18 +442,19 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ subset_barrier.arrive_and_wait(); auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - /* auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; auto const output_dest = output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; auto const shared_offset = block_row_size * relative_row; + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); } @@ -528,23 +526,22 @@ __global__ 
void copy_validity_from_columns( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / warp_size; - int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx / num_sections_x; - auto const section_y = my_section_idx % num_sections_x; + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { @@ -552,14 +549,14 @@ __global__ void copy_validity_from_columns( input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make // the bytes we actually write. 
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % warp_size == 0) { + if (threadIdx.x % detail::warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -591,6 +588,7 @@ __global__ void copy_validity_from_columns( auto const output_ptr = output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); @@ -647,7 +645,6 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( @@ -718,9 +715,9 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co extern __shared__ int8_t shared_data[]; int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - __shared__ cuda::barrier block_barrier[stages_count]; + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < stages_count; ++i) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { init(&block_barrier[i], group.size()); } } @@ -748,12 +745,11 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co block_infos, _col_sizes, _col_offsets, row_offsets, input_data, shared, group, block_barrier); - auto &subset_barrier = block_barrier[subset % stages_count]; + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready subset_barrier.arrive_and_wait(); - auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); @@ -851,18 +847,15 @@ __global__ void copy_validity_to_columns( auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; auto const block_start_col = block.start_col; auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; auto const num_sections_y = (num_block_rows + 31) / 32; auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - - int const warp_id = threadIdx.x / warp_size; - int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / 
detail::warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -870,7 +863,6 @@ __global__ void copy_validity_to_columns( // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; auto const relative_row = section_y * 32 + lane_id; auto const absolute_col = relative_col + block_start_col; @@ -890,9 +882,11 @@ __global__ void copy_validity_to_columns( ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % warp_size == 0) { + if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; + auto const write_5006_offset = 837; // validity_data_col_length * (65 - block_start_col) + // + (5006 - block_start_row)/8; if (rows_left <= 8) { // write byte @@ -922,6 +916,8 @@ __global__ void copy_validity_to_columns( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; + auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( output_nm[col] + word_index(block_start_row), @@ -965,8 +961,9 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) + if (y_block_size > 32) { y_block_size = 32; + } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -1135,7 +1132,10 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro }(), 8); // we fit as much as we can given the column stride - auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! + auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1203,13 +1203,12 @@ std::vector build_block_infos(std::vector const &column_s // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = - std::clamp(util::round_up_safe( - optimal_square_len <= (size_type)column_sizes.size() ? 
- std::min(optimal_square_len / column_sizes[0], total_number_of_rows) : - row_batches[0].row_count / 2, - 32), - 1, row_batches[0].row_count); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, row_batches[0].row_count); auto calc_admin_data_size = [](int num_cols) -> size_type { // admin data is the column sizes and column start information. @@ -1233,8 +1232,9 @@ std::vector build_block_infos(std::vector const &column_s if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match @@ -1274,9 +1274,8 @@ std::vector> convert_to_rows(cudf::table_view cons int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: kernels fail to launch if we use all the available shared memory. + // TODO: why? total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. @@ -1381,6 +1380,16 @@ std::vector> convert_to_rows(cudf::table_view cons } c = c.child(1); } + exclusive_scan([t](int row_index) { + size_type total_row_size = 0; + for (int i=0 i> convert_to_rows(cudf::table_view cons // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1578,7 +1588,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: unable to launch a kernel with all shared used + // TODO why? 
total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; @@ -1628,11 +1638,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); -#else - dim3 threads(std::min(256, (int)child.size())); -#endif + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); detail::copy_to_columns<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), @@ -1641,8 +1647,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); auto const column_stride = [&]() { if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); } else { return util::round_down_safe(desired_rows_and_columns, 8); } From 02cb81b95d53ad2e8330fcf768f55fc1502d707d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 1 Oct 2021 15:14:54 +0000 Subject: [PATCH 18/80] Fixing merge issue --- cpp/benchmarks/CMakeLists.txt | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5cc48436d01..7d353c37df7 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -29,6 +29,7 @@ target_link_libraries(cudf_datagen GTest::gmock_main GTest::gtest_main benchmark::benchmark + nvbench::nvbench Threads::Threads cudf) @@ -50,11 +51,19 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) function(ConfigureBench CMAKE_BENCH_NAME) add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) endfunction() +function(ConfigureNVBench CMAKE_BENCH_NAME) + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties(${CMAKE_BENCH_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + target_link_libraries(${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common cudf_datagen nvbench::main) +endfunction() + ################################################################################################### # - column benchmarks ----------------------------------------------------------------------------- ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) @@ -67,6 +76,10 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) # - scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) +################################################################################################### +# - lists scatter benchmark ----------------------------------------------------------------------- +ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) + ################################################################################################### # - 
contiguous_split benchmark ------------------------------------------------------------------- ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) @@ -89,7 +102,8 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/join_benchmark.cu) +ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) +ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) ################################################################################################### # - iterator benchmark ---------------------------------------------------------------------------- @@ -191,6 +205,7 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) # - binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cpp + binaryop/compiled_binaryop_benchmark.cpp binaryop/jit_binaryop_benchmark.cpp) ################################################################################################### @@ -218,6 +233,7 @@ ConfigureBench(STRINGS_BENCH string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp + string/repeat_strings_benchmark.cpp string/replace_benchmark.cpp string/replace_re_benchmark.cpp string/split_benchmark.cpp @@ -231,5 +247,10 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) ################################################################################################### -# - row conversion benchmark ---------------------------------------------------------------------------- +# - io benchmark --------------------------------------------------------------------- +ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK + io/text/multibyte_split_benchmark.cpp) + +################################################################################################### +# - row conversion benchmark --------------------------------------------------------- ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) From bae16f6976bda67f46e24e399b014ea1f7aff38d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 1 Oct 2021 15:17:11 +0000 Subject: [PATCH 19/80] working on code to move block creation and batch creation to gpu --- cpp/src/row_conversion/row_conversion.cu | 180 +++++++++++++++++++- cpp/tests/row_conversion/row_conversion.cpp | 7 - 2 files changed, 178 insertions(+), 9 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index eb3c4b28b6a..ae218e637d0 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -20,6 +20,8 @@ #include #include #include +#include "cudf/detail/iterator.cuh" +#include "cudf/lists/lists_column_device_view.cuh" #include @@ -43,7 +45,9 @@ #include #include #include +#include +#include #include #include @@ -56,6 +60,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; +using rmm::device_uvector; namespace cudf { namespace detail { @@ -1352,8 +1357,6 @@ __global__ void copy_validity_to_columns(const size_type num_rows, if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - auto const write_5006_offset = 
837; // validity_data_col_length * (65 - - // block_start_col) + (5006 - block_start_row)/8; if (print_debug) printf( @@ -1674,6 +1677,173 @@ std::vector build_validity_block_infos( return validity_block_infos; } +constexpr size_t max_batch_size = 1024; // 2ul * 1024 * 1024 * 1024; + +template +void build_batches(size_t total_size, + size_type num_rows, + CumulativeRowSize cumulative_row_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_batches = ((total_size + (max_batch_size - 1)) / max_batch_size); + auto const num_offsets = num_batches + 1; + printf("%lu batches so %lu offsets\n", num_batches, num_offsets); + + // at most max gpu memory / 2GB iterations. + std::vector h_batch_row_offsets; + h_batch_row_offsets.reserve(num_offsets); + h_batch_row_offsets.push_back(0); + size_type last_row_end = 0; + while (h_batch_row_offsets.size() < num_batches) { + // subtract out the size of the last row in the previous batch + auto adjusted_row_size = + thrust::make_transform_iterator(cumulative_row_size + last_row_end, + [last_row_end, cumulative_row_size] __device__(size_t size) { + return size - cumulative_row_size[last_row_end]; + }); + // find the next max_batch_size boundary + size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), + adjusted_row_size, + adjusted_row_size + (num_rows - last_row_end), + max_batch_size) - + adjusted_row_size) + + last_row_end) - + 1; + + h_batch_row_offsets.push_back(row_end); + last_row_end = row_end; + } + printf("batches: "); + for (uint i = 0; i < h_batch_row_offsets.size(); ++i) { + printf("%d ", h_batch_row_offsets[i]); + } + printf("\n"); +} + +int compute_block_counts(device_uvector const& batch_row_offsets, + int desired_window_height, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / + desired_window_height; + }); + return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); +} + +size_type block_lambda( + block_info* blocks, + device_uvector const& batch_row_offsets, // comes from build_batches + int column_start, + int column_end, + int desired_window_height, + int total_number_of_rows, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_blocks.begin(), + [=, batch_row_offsets = batch_row_offsets.data()] __device__(int batch_index) -> size_type { + return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / + desired_window_height; + }); + size_type const total_blocks = + thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + device_uvector block_starts(num_batches, stream); + thrust::exclusive_scan(rmm::exec_policy(stream), + num_blocks.begin(), + num_blocks.end(), + block_starts.begin()); // in blocks + + thrust::for_each( + rmm::exec_policy(stream), + iter, + iter + 
total_blocks, + [ =, + block_starts = block_starts.data(), + batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + block_info& bi = blocks[block_index]; + + // what batch this block falls in + auto const batch_index_iter = + thrust::lower_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); + auto const batch_index = batch_index_iter == block_starts ? 0 : *batch_index_iter; + // local index within the block + int const local_block_index = block_index - block_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_offsets[batch_index]; + // the start row for this block + int const block_row_start = batch_row_start + (local_block_index * desired_window_height); + // the end row for this block + int const max_row = std::min(total_number_of_rows, + batch_index + 1 > num_batches + ? std::numeric_limits::max() + : static_cast(batch_row_offsets[batch_index + 1])); + int const block_row_end = + std::min(batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, + total_number_of_rows); + + // stuff the block + bi.start_col = column_start; + bi.end_col = column_end; + bi.start_row = block_row_start; + bi.end_row = block_row_end; + bi.buffer_num = batch_index; + }); + + return total_blocks; +} + +void test_block_lambda(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ + device_uvector batch_row_offsets(3, stream); + batch_row_offsets.set_element(0, 0, stream); + batch_row_offsets.set_element(1, 2000, stream); + batch_row_offsets.set_element(2, 5000, stream); + + // three groups of columns that can hold 128, 1024, and 768 rows each. + auto const total_blocks = compute_block_counts(batch_row_offsets, 128, stream, mr) + + compute_block_counts(batch_row_offsets, 1024, stream, mr) + + compute_block_counts(batch_row_offsets, 768, stream, mr); + + auto const table_num_rows = 50 * 1024; + + // allocate memory for all blocks + device_uvector blocks(total_blocks, stream); + + auto used_blocks = + block_lambda(blocks.data(), batch_row_offsets, 0, 15, 128, table_num_rows, stream, mr); + used_blocks += block_lambda( + blocks.data() + used_blocks, batch_row_offsets, 16, 28, 1024, table_num_rows, stream, mr); + used_blocks += block_lambda( + blocks.data() + used_blocks, batch_row_offsets, 29, 32, 768, table_num_rows, stream, mr); + + CUDF_EXPECTS(used_blocks == total_blocks, "used not equal to total!"); + + for (int i = 0; i < total_blocks; ++i) { + auto const block = blocks.element(i, stream); + printf( + "%d: %d,%d -> %d,%d\n", i, block.start_col, block.start_row, block.end_col, block.end_row); + } +} + std::vector build_block_infos(std::vector const& column_sizes, std::vector const& column_starts, std::vector const& row_batches, @@ -2245,6 +2415,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in cudf::size_type num_columns = schema.size(); cudf::size_type num_rows = input.parent().size(); + auto cumulative_row_size = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_t row_index) { return 300 * row_index; }); + detail::build_batches(1024 * 1024, 1024, cumulative_row_size, stream, mr); + + detail::test_block_lambda(stream, mr); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem; diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 70a4552a6f9..48d9690d583 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -185,13 
+185,6 @@ TEST_F(ColumnToRowTests, Non2Power) auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { - printf("testing column %d\n", j); - if (j == 65) { - printf("old\n"); - cudf::test::print(old_tbl->get_column(j)); - printf("new\n"); - cudf::test::print(new_tbl->get_column(j)); - } CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); } From 36568485ec7caaec1ab5188b1cb5a1fbaea45b51 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 6 Oct 2021 19:41:49 +0000 Subject: [PATCH 20/80] pulling incomplete code for gpu building block data --- cpp/src/row_conversion/row_conversion.cu | 173 --------------------- java/src/main/native/src/row_conversion.cu | 53 +------ 2 files changed, 6 insertions(+), 220 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index ae218e637d0..9674000a69d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -1677,173 +1677,6 @@ std::vector build_validity_block_infos( return validity_block_infos; } -constexpr size_t max_batch_size = 1024; // 2ul * 1024 * 1024 * 1024; - -template -void build_batches(size_t total_size, - size_type num_rows, - CumulativeRowSize cumulative_row_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_batches = ((total_size + (max_batch_size - 1)) / max_batch_size); - auto const num_offsets = num_batches + 1; - printf("%lu batches so %lu offsets\n", num_batches, num_offsets); - - // at most max gpu memory / 2GB iterations. - std::vector h_batch_row_offsets; - h_batch_row_offsets.reserve(num_offsets); - h_batch_row_offsets.push_back(0); - size_type last_row_end = 0; - while (h_batch_row_offsets.size() < num_batches) { - // subtract out the size of the last row in the previous batch - auto adjusted_row_size = - thrust::make_transform_iterator(cumulative_row_size + last_row_end, - [last_row_end, cumulative_row_size] __device__(size_t size) { - return size - cumulative_row_size[last_row_end]; - }); - // find the next max_batch_size boundary - size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), - adjusted_row_size, - adjusted_row_size + (num_rows - last_row_end), - max_batch_size) - - adjusted_row_size) + - last_row_end) - - 1; - - h_batch_row_offsets.push_back(row_end); - last_row_end = row_end; - } - printf("batches: "); - for (uint i = 0; i < h_batch_row_offsets.size(); ++i) { - printf("%d ", h_batch_row_offsets[i]); - } - printf("\n"); -} - -int compute_block_counts(device_uvector const& batch_row_offsets, - int desired_window_height, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type const num_batches = batch_row_offsets.size() - 1; - device_uvector num_blocks(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / - desired_window_height; - }); - return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); -} - -size_type block_lambda( - block_info* blocks, - device_uvector const& batch_row_offsets, // comes from build_batches - int column_start, - int column_end, - int 
desired_window_height, - int total_number_of_rows, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type const num_batches = batch_row_offsets.size() - 1; - device_uvector num_blocks(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_blocks.begin(), - [=, batch_row_offsets = batch_row_offsets.data()] __device__(int batch_index) -> size_type { - return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / - desired_window_height; - }); - size_type const total_blocks = - thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); - device_uvector block_starts(num_batches, stream); - thrust::exclusive_scan(rmm::exec_policy(stream), - num_blocks.begin(), - num_blocks.end(), - block_starts.begin()); // in blocks - - thrust::for_each( - rmm::exec_policy(stream), - iter, - iter + total_blocks, - [ =, - block_starts = block_starts.data(), - batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { - block_info& bi = blocks[block_index]; - - // what batch this block falls in - auto const batch_index_iter = - thrust::lower_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); - auto const batch_index = batch_index_iter == block_starts ? 0 : *batch_index_iter; - // local index within the block - int const local_block_index = block_index - block_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_offsets[batch_index]; - // the start row for this block - int const block_row_start = batch_row_start + (local_block_index * desired_window_height); - // the end row for this block - int const max_row = std::min(total_number_of_rows, - batch_index + 1 > num_batches - ? std::numeric_limits::max() - : static_cast(batch_row_offsets[batch_index + 1])); - int const block_row_end = - std::min(batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, - total_number_of_rows); - - // stuff the block - bi.start_col = column_start; - bi.end_col = column_end; - bi.start_row = block_row_start; - bi.end_row = block_row_end; - bi.buffer_num = batch_index; - }); - - return total_blocks; -} - -void test_block_lambda(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - device_uvector batch_row_offsets(3, stream); - batch_row_offsets.set_element(0, 0, stream); - batch_row_offsets.set_element(1, 2000, stream); - batch_row_offsets.set_element(2, 5000, stream); - - // three groups of columns that can hold 128, 1024, and 768 rows each. 
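// A rough host-side sketch of the batch/block bookkeeping that the compute_block_counts /
// block_lambda helpers perform with thrust on the device: count how many blocks each batch
// contributes, exclusive-scan those counts to find where each batch's blocks begin in the
// flat block array, then fill in per-block row ranges. This is only an illustration under
// assumptions (plain std:: containers, ceiling division, hypothetical names sketch_block /
// sketch_build_blocks) and is not a cudf API.
#include <algorithm>
#include <numeric>
#include <vector>

struct sketch_block {
  int start_row;
  int end_row;
  int batch;
};

inline std::vector<sketch_block> sketch_build_blocks(
  std::vector<int> const& batch_row_offsets,  // num_batches + 1 row offsets
  int rows_per_block)
{
  int const num_batches = static_cast<int>(batch_row_offsets.size()) - 1;
  if (num_batches <= 0) { return {}; }

  // blocks contributed by each batch (ceiling division so a partial block still counts)
  std::vector<int> blocks_per_batch(num_batches);
  for (int b = 0; b < num_batches; ++b) {
    int const rows      = batch_row_offsets[b + 1] - batch_row_offsets[b];
    blocks_per_batch[b] = (rows + rows_per_block - 1) / rows_per_block;
  }

  // where each batch's blocks start in the flat output array
  std::vector<int> block_starts(num_batches);
  std::exclusive_scan(blocks_per_batch.begin(), blocks_per_batch.end(), block_starts.begin(), 0);

  int const total_blocks = block_starts.back() + blocks_per_batch.back();
  std::vector<sketch_block> blocks(total_blocks);
  for (int b = 0; b < num_batches; ++b) {
    for (int i = 0; i < blocks_per_batch[b]; ++i) {
      int const start_row = batch_row_offsets[b] + i * rows_per_block;
      int const end_row =
        std::min(start_row + rows_per_block - 1, batch_row_offsets[b + 1] - 1);
      blocks[block_starts[b] + i] = {start_row, end_row, b};
    }
  }
  return blocks;
}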
- auto const total_blocks = compute_block_counts(batch_row_offsets, 128, stream, mr) + - compute_block_counts(batch_row_offsets, 1024, stream, mr) + - compute_block_counts(batch_row_offsets, 768, stream, mr); - - auto const table_num_rows = 50 * 1024; - - // allocate memory for all blocks - device_uvector blocks(total_blocks, stream); - - auto used_blocks = - block_lambda(blocks.data(), batch_row_offsets, 0, 15, 128, table_num_rows, stream, mr); - used_blocks += block_lambda( - blocks.data() + used_blocks, batch_row_offsets, 16, 28, 1024, table_num_rows, stream, mr); - used_blocks += block_lambda( - blocks.data() + used_blocks, batch_row_offsets, 29, 32, 768, table_num_rows, stream, mr); - - CUDF_EXPECTS(used_blocks == total_blocks, "used not equal to total!"); - - for (int i = 0; i < total_blocks; ++i) { - auto const block = blocks.element(i, stream); - printf( - "%d: %d,%d -> %d,%d\n", i, block.start_col, block.start_row, block.end_col, block.end_row); - } -} - std::vector build_block_infos(std::vector const& column_sizes, std::vector const& column_starts, std::vector const& row_batches, @@ -2415,12 +2248,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in cudf::size_type num_columns = schema.size(); cudf::size_type num_rows = input.parent().size(); - auto cumulative_row_size = cudf::detail::make_counting_transform_iterator( - 0, [] __device__(size_t row_index) { return 300 * row_index; }); - detail::build_batches(1024 * 1024, 1024, cumulative_row_size, stream, mr); - - detail::test_block_lambda(stream, mr); - int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem; diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c64a61b3373..481787c6004 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,6 +21,8 @@ #include #include +#include +#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -42,6 +44,8 @@ #include #include #include +#include +#include #include #include @@ -54,6 +58,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; +using rmm::device_uvector; namespace cudf { namespace detail { @@ -885,8 +890,6 @@ __global__ void copy_validity_to_columns( if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - auto const write_5006_offset = 837; // validity_data_col_length * (65 - block_start_col) - // + (5006 - block_start_row)/8; if (rows_left <= 8) { // write byte @@ -1330,28 +1333,7 @@ std::vector> convert_to_rows(cudf::table_view cons }); size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, - column_sizes); //, - // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 
8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); @@ -1368,29 +1350,6 @@ std::vector> convert_to_rows(cudf::table_view cons // will be included in the variable-width data blob at the end of the // row. return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - exclusive_scan([t](int row_index) { - size_type total_row_size = 0; - for (int i=0 i Date: Wed, 6 Oct 2021 14:43:18 -0700 Subject: [PATCH 21/80] Use the new row<->col method Added a new method `convertFromRowsFixedWidthOptimized` and `convertToRowsFixedWidthOptimized` to be used for when columns are < 100. Otherwise use the new method This is currently failing simple tests --- java/src/main/java/ai/rapids/cudf/Table.java | 33 +++++++++++ java/src/main/native/src/TableJni.cpp | 56 ++++++++++++++++++- .../test/java/ai/rapids/cudf/TableTest.java | 43 +++++++++++++- 3 files changed, 128 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 0af02d1c926..65c8fcc2c0d 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -627,8 +627,12 @@ private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long left private static native long[] convertToRows(long nativeHandle); + private static native long[] convertToRowsFixedWidthOptimized(long nativeHandle); + private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale); + private static native long[] convertFromRowsFixedWidthOptimized(long nativeColumnView, int[] types, int[] scale); + private static native long[] repeatStaticCount(long tableHandle, int count); private static native long[] repeatColumnCount(long tableHandle, @@ -2684,6 +2688,15 @@ public ColumnVector[] convertToRows() { return ret; } + public ColumnVector[] convertToRowsFixedWidthOptimized() { + long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); + ColumnVector[] ret = new ColumnVector[ptrs.length]; + for (int i = 0; i < ptrs.length; i++) { + ret[i] = new ColumnVector(ptrs[i]); + } + return ret; + } + /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. @@ -2704,6 +2717,26 @@ public static Table convertFromRows(ColumnView vec, DType ... schema) { return new Table(convertFromRows(vec.getNativeView(), types, scale)); } + /** + * Convert a column of list of bytes that is formatted like the output from `convertToRows` + * and convert it back to a table. + * @param vec the row data to process. + * @param schema the types of each column. + * @return the parsed table. 
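 * <p>
 * Note: this method is intended to pair with {@code convertToRowsFixedWidthOptimized}, which
 * handles only relatively narrow tables of fixed-width columns; rows produced by
 * {@code convertToRows} should instead be converted back with {@code convertFromRows}.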
+ */ + public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) { + // TODO at some point we need a schema that support nesting so we can support nested types + // TODO we will need scale at some point very soon too + int[] types = new int[schema.length]; + int[] scale = new int[schema.length]; + for (int i = 0; i < schema.length; i++) { + types[i] = schema[i].typeId.nativeId; + scale[i] = schema[i].getScale(); + + } + return new Table(convertFromRowsFixedWidthOptimized(vec.getNativeView(), types, scale)); + } + /** * Construct a table from a packed representation. * @param metadata host-based metadata for the table diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index ee75112a2ed..cdd0623eb77 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -597,16 +598,20 @@ class native_arrow_ipc_reader_handle final { static jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, std::vector> &extra_columns) { + std::cout << "entering convert_table_for_return\n"; std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); + std::cout << "0\n"; for (int i = 0; i < table_cols; i++) { outcol_handles[i] = reinterpret_cast(ret[i].release()); } + std::cout << "1\n"; for (size_t i = 0; i < extra_columns.size(); i++) { outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); } + std::cout << "exiting convert_table_for_return\n"; return outcol_handles.get_jArray(); } @@ -2692,14 +2697,35 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, + jlong input_table) { + JNI_NULL_CHECK(env, input_table, "input table is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::table_view *n_input_table = reinterpret_cast(input_table); + std::vector> cols = cudf::old_convert_to_rows(*n_input_table); + int num_columns = cols.size(); + cudf::jni::native_jlongArray outcol_handles(env, num_columns); + for (int i = 0; i < num_columns; i++) { + outcol_handles[i] = reinterpret_cast(cols[i].release()); + } + return outcol_handles.get_jArray(); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { + std::cout << "convert_to_rows\n"; cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::java::convert_to_rows(*n_input_table); + std::cout << "before convert_to_rows\n"; + std::vector> cols = cudf::convert_to_rows(*n_input_table); + std::cout << "after convert_to_rows\n"; int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2710,6 +2736,29 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(JNIEnv *env, jclass, + jlong input_column, + jintArray types, + jintArray scale) { + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + 
JNI_NULL_CHECK(env, types, "types is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *input = reinterpret_cast(input_column); + cudf::lists_column_view list_input(*input); + cudf::jni::native_jintArray n_types(env, types); + cudf::jni::native_jintArray n_scale(env, scale); + std::vector types_vec; + for (int i = 0; i < n_types.size(); i++) { + types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); + } + std::unique_ptr result = cudf::old_convert_from_rows(list_input, types_vec); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass, jlong input_column, jintArray types, @@ -2718,6 +2767,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e JNI_NULL_CHECK(env, types, "types is null", 0); try { + std::cout << "convert_from_rows\n"; cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_column); cudf::lists_column_view list_input(*input); @@ -2727,7 +2777,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); + std::cout << "before convert_from_rows\n"; + std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); + std::cout << "after convert_from_rows\n"; return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index aa9ef5bf766..4ddeb542bbf 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -51,6 +51,7 @@ import java.nio.file.Files; import java.util.*; import java.util.stream.Collectors; +import java.util.stream.IntStream; import static ai.rapids.cudf.ParquetColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; @@ -7093,6 +7094,44 @@ void testStructColumnFilterStrings() { } } + @Test + void fixedWidthRowsRoundTripWide() { + TestBuilder tb = new TestBuilder(); + IntStream.range(0, 10).forEach(i -> tb.column(3l, 9l, 4l, 2l, 20l, null)); + IntStream.range(0, 10).forEach(i -> tb.column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null)); + IntStream.range(0, 10).forEach(i -> tb.column(5, 1, 0, 2, 7, null)); + IntStream.range(0, 10).forEach(i -> tb.column(true, false, false, true, false, null)); + IntStream.range(0, 10).forEach(i -> tb.column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f, null)); + IntStream.range(0, 10).forEach(i -> tb.column(new Byte[]{2, 3, 4, 5, 9, null})); + IntStream.range(0, 10).forEach(i -> tb.decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, + 9.5d, 0.9d, 7.23d, 2.8d, null)); + IntStream.range(0, 10).forEach(i -> tb.decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null)); + try (Table t = tb.build()) { + ColumnVector[] rows = t.convertToRows(); + try { + // We didn't overflow + assert rows.length == 1; + ColumnVector cv = rows[0]; + assert cv.getRowCount() == t.getRowCount(); +// try (HostColumnVector hcv = cv.copyToHost()) { +// hcv.getChildColumnView(0).getDataBuffer().printBuffer(8); +// } + + DType[] types = new DType[t.getNumberOfColumns()]; + for (int i = 0; i < t.getNumberOfColumns(); i++) { + types[i] = t.getColumn(i).getType(); + } + try (Table backAgain = Table.convertFromRows(cv, types)) 
{ + assertTablesAreEqual(t, backAgain); + } + } finally { + for (ColumnVector cv : rows) { + cv.close(); + } + } + } + } + @Test void fixedWidthRowsRoundTrip() { try (Table t = new TestBuilder() @@ -7105,7 +7144,7 @@ void fixedWidthRowsRoundTrip() { .decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) .decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null) .build()) { - ColumnVector[] rows = t.convertToRows(); + ColumnVector[] rows = t.convertToRowsFixedWidthOptimized(); try { // We didn't overflow assert rows.length == 1; @@ -7119,7 +7158,7 @@ void fixedWidthRowsRoundTrip() { for (int i = 0; i < t.getNumberOfColumns(); i++) { types[i] = t.getColumn(i).getType(); } - try (Table backAgain = Table.convertFromRows(cv, types)) { + try (Table backAgain = Table.convertFromRowsFixedWidthOptimized(cv, types)) { assertTablesAreEqual(t, backAgain); } } finally { From 966c34ce57c17ee924d91cb2f83f3468c2c43833 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 7 Oct 2021 04:01:36 +0000 Subject: [PATCH 22/80] Fixing issue Raza found with 8-byte data --- cpp/src/row_conversion/row_conversion.cu | 27 +++-- cpp/tests/row_conversion/row_conversion.cpp | 122 ++++++++++++++++---- java/src/main/native/src/row_conversion.cu | 23 ++-- 3 files changed, 132 insertions(+), 40 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 9674000a69d..84fab20fce5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -333,9 +333,9 @@ struct block_info { int end_row; int buffer_num; - __host__ __device__ size_type get_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool debug_print = false) const + __host__ __device__ size_type get_shared_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const { if (debug_print) printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", @@ -350,6 +350,14 @@ struct block_info { col_offsets[start_col]); return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } + __host__ __device__ size_type get_dest_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const + { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + + util::div_rounding_up_unsafe(num_cols(), 8), + 8); + } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -456,7 +464,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[fetch_block.start_col]; auto& fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -513,7 +521,8 @@ __global__ void copy_from_columns(const size_type num_rows, /* auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols();*/ - auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, 
col_sizes); + auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -521,7 +530,7 @@ __global__ void copy_from_columns(const size_type num_rows, absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -918,8 +927,8 @@ static __device__ void fetch_blocks_for_row_to_column( auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -1115,7 +1124,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - auto block_row_size = block.get_row_size(_col_offsets, _col_sizes, debug_print); + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes, debug_print); // now we copy from shared memory to final destination. // the data is laid out in rows in shared memory, so the reads diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 48d9690d583..0ab8b70a0f7 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -46,9 +46,9 @@ TEST_F(ColumnToRowTests, Single) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Simple) @@ -68,9 +68,9 @@ TEST_F(ColumnToRowTests, Simple) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Tall) @@ -93,9 +93,9 @@ TEST_F(ColumnToRowTests, Tall) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Wide) @@ -122,9 +122,9 @@ TEST_F(ColumnToRowTests, Wide) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, SingleByteWide) @@ -153,9 +153,9 @@ 
TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Non2Power) @@ -191,9 +191,9 @@ TEST_F(ColumnToRowTests, Non2Power) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Big) @@ -218,9 +218,21 @@ TEST_F(ColumnToRowTests, Big) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Bigger) @@ -245,9 +257,20 @@ TEST_F(ColumnToRowTests, Bigger) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Biggest) @@ -272,9 +295,20 @@ TEST_F(ColumnToRowTests, Biggest) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(RowToColumnTests, Single) @@ -379,6 +413,46 @@ TEST_F(RowToColumnTests, SingleByteWide) } } +TEST_F(RowToColumnTests, Raza) +{ + std::vector> cols; + std::vector views; + std::vector schema{cudf::data_type{cudf::type_id::INT64}, + cudf::data_type{cudf::type_id::FLOAT64}, + 
cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::BOOL8}, + cudf::data_type{cudf::type_id::FLOAT32}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::INT64}}; + + cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_point_column_wrapper c6( + {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper c7( + {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); + + cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + TEST_F(RowToColumnTests, Non2Power) { auto r = diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 481787c6004..1808c7534df 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -330,10 +330,18 @@ struct block_info { int end_row; int buffer_num; - __host__ __device__ size_type get_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { + __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes, + bool debug_print = false) const { return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } + __host__ __device__ size_type get_dest_row_size(size_type const *const col_offsets, + size_type const *const col_sizes, + bool debug_print = false) const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + + util::div_rounding_up_unsafe(num_cols(), 8), + 8); + } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -409,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[fetch_block.start_col]; auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -448,7 +456,8 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - auto const block_row_size = 
block.get_row_size(col_offsets, col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -457,7 +466,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; auto const shared_offset = block_row_size * relative_row; cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, @@ -650,7 +659,7 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); @@ -766,7 +775,7 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - auto block_row_size = block.get_row_size(_col_offsets, _col_sizes); + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); // now we copy from shared memory to final destination. // the data is laid out in rows in shared memory, so the reads From 6452e8eb6137e2a1f31049ec0dc0add1d6947f9f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 13 Oct 2021 22:06:04 +0000 Subject: [PATCH 23/80] fixing bug with float columns when 'enough' data was present. 
Updated function names --- .../row_conversion/row_conversion.cpp | 8 +- cpp/include/cudf/row_conversion.hpp | 4 +- cpp/src/row_conversion/row_conversion.cu | 95 ++++--- cpp/tests/row_conversion/row_conversion.cpp | 245 ++++++++++++------ java/src/main/native/src/TableJni.cpp | 16 +- java/src/main/native/src/row_conversion.cu | 66 ++--- java/src/main/native/src/row_conversion.hpp | 19 +- 7 files changed, 265 insertions(+), 188 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 2fe436a22c1..fb8e4c8aef3 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -50,7 +50,7 @@ static void BM_old_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -109,13 +109,13 @@ static void BM_old_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::old_convert_from_rows(first_list, schema); + auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -144,7 +144,7 @@ static void BM_new_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 8f82d01b06c..5d799f4c596 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -24,7 +24,7 @@ namespace cudf { -std::vector> old_convert_to_rows( +std::vector> convert_to_rows_fixed_width_optimized( cudf::table_view const& tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -36,7 +36,7 @@ std::vector> convert_to_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr old_convert_from_rows( +std::unique_ptr convert_from_rows_fixed_width_optimized( cudf::lists_column_view const& input, std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 84fab20fce5..0457bbf71e4 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -53,7 +53,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -350,14 +350,6 @@ struct block_info { 
col_offsets[start_col]); return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } - __host__ __device__ size_type get_dest_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool debug_print = false) const - { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + - util::div_rounding_up_unsafe(num_cols(), 8), - 8); - } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -441,9 +433,8 @@ __global__ void copy_from_columns(const size_type num_rows, // else { return; } auto const blocks_remaining = - std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); size_t fetch; size_t subset; @@ -451,11 +442,11 @@ __global__ void copy_from_columns(const size_type num_rows, // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { if (debug_print) - printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch); - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch); + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; if (debug_print) printf("block %lu rows %d-%d and cols %d-%d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch, + blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch, fetch_block.start_row, fetch_block.end_row, fetch_block.start_col, @@ -474,9 +465,9 @@ __global__ void copy_from_columns(const size_type num_rows, // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it more - // appealing to copy element-wise from input data into shared matching the end layout and do - // row-based memcopies out. + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; @@ -499,14 +490,15 @@ __global__ void copy_from_columns(const size_type num_rows, auto const input_src = input_data[absolute_col] + col_size * absolute_row; if (debug_print) - printf("block %lu to shared chunk %lu. %p <- %p - %d bytes\n", + printf("block %lu to shared chunk %lu. 
%p <- %p(0x%x) - %d bytes\n", fetch, fetch % stages_count, &shared[fetch % stages_count][shared_offset], input_src, + *input_src, col_size); - // copy the element to global memory + // copy the element from global memory cuda::memcpy_async( &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -515,14 +507,11 @@ __global__ void copy_from_columns(const size_type num_rows, auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; if (debug_print) - printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset); + printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - /* auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols();*/ auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -530,7 +519,7 @@ __global__ void copy_from_columns(const size_type num_rows, absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; + output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -543,6 +532,7 @@ __global__ void copy_from_columns(const size_type num_rows, &shared[subset % stages_count][shared_offset], block_row_size, absolute_row); + cuda::memcpy_async( output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); } @@ -673,7 +663,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, " + "blockDim.x=%d, " "warp size " "%d\n", threadIdx.x, @@ -709,7 +700,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d absolute)\n", + "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d " + "absolute)\n", participation_mask, relative_row, absolute_row, @@ -744,8 +736,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_col); // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make - // the bytes we actually write. + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data @@ -915,7 +907,8 @@ static __device__ void fetch_blocks_for_row_to_column( block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; if (debug_print) printf( - "fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, ending " + "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, " + "ending " "offset %p\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, total_blocks, @@ -1242,7 +1235,8 @@ __global__ void copy_validity_to_columns(const size_type num_rows, block_infos, blockIdx.x); printf( - "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, row " + "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, " + "row " "offsets are %p, block infos at %p\n", threadIdx.x, blockIdx.x, @@ -1595,8 +1589,8 @@ static inline int32_t compute_fixed_width_layout(std::vector co } // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it - // in + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in int32_t validity_bytes_needed = (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. @@ -1727,11 +1721,11 @@ std::vector build_block_infos(std::vector const& column_s }; // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. + // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = std::clamp( util::round_up_safe( @@ -1787,9 +1781,11 @@ std::vector build_block_infos(std::vector const& column_s calc_admin_data_size(col - current_window_start_col), shmem_limit_per_block); printf( - "Window size %d too large at column %d, admin size is %d, bumping back to build windows of " + "Window size %d too large at column %d, admin size is %d, bumping back to build windows " + "of " "size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is " + "%d) " "for shared mem size %d\n", row_size_with_end_pad * window_height, col, @@ -1809,7 +1805,8 @@ std::vector build_block_infos(std::vector const& column_s detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); #if defined(DEBUG) printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "New window starting with offset %d and row size %d to be %d (previous column offset " + "%d+%d " "or %d)\n", row_size, col_size, @@ -2172,9 +2169,8 @@ std::vector> convert_to_rows(cudf::table_view cons #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> old_convert_to_rows(cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { const cudf::size_type num_columns = tbl.num_columns(); @@ -2399,10 +2395,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr old_convert_from_rows(cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // verify that the types are what we expect cudf::column_view child = input.child(); diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 0ab8b70a0f7..746ac0655f7 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -14,15 +14,21 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include +#include #include #include #include #include #include -#include -#include "cudf/lists/lists_column_view.hpp" -#include "cudf/types.hpp" +#include + +#include struct ColumnToRowTests : public cudf::test::BaseFixture { }; @@ -35,20 +41,17 @@ TEST_F(ColumnToRowTests, Single) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Simple) @@ -57,20 +60,17 @@ TEST_F(ColumnToRowTests, Simple) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Tall) @@ -81,21 +81,18 @@ TEST_F(ColumnToRowTests, Tall) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Wide) @@ -111,20 +108,17 @@ TEST_F(ColumnToRowTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = 
cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, SingleByteWide) @@ -141,21 +135,18 @@ TEST_F(ColumnToRowTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Non2Power) @@ -175,13 +166,14 @@ TEST_F(ColumnToRowTests, Non2Power) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -190,10 +182,6 @@ TEST_F(ColumnToRowTests, Non2Power) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Big) @@ -214,13 +202,14 @@ TEST_F(ColumnToRowTests, Big) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -229,10 +218,6 @@ TEST_F(ColumnToRowTests, Big) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Bigger) @@ -253,12 +238,13 @@ TEST_F(ColumnToRowTests, Bigger) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -267,10 +253,6 @@ TEST_F(ColumnToRowTests, Bigger) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Biggest) @@ -291,13 +273,14 @@ TEST_F(ColumnToRowTests, Biggest) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -306,9 +289,6 @@ TEST_F(ColumnToRowTests, Biggest) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(RowToColumnTests, Single) @@ -319,7 +299,8 @@ TEST_F(RowToColumnTests, Single) auto old_rows = cudf::convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -331,10 +312,11 @@ TEST_F(RowToColumnTests, Simple) cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -348,14 +330,15 @@ TEST_F(RowToColumnTests, Tall) cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -373,7 +356,7 @@ 
TEST_F(RowToColumnTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { @@ -381,7 +364,8 @@ TEST_F(RowToColumnTests, Wide) } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -399,21 +383,22 @@ TEST_F(RowToColumnTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } } -TEST_F(RowToColumnTests, Raza) +TEST_F(RowToColumnTests, AllTypes) { std::vector> cols; std::vector views; @@ -442,11 +427,115 @@ TEST_F(RowToColumnTests, Raza) cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, AllTypesLarge) +{ + std::vector cols; + std::vector schema{}; + + // 10 columns of each type with 1024 entries + constexpr int num_rows{1024}; + + std::default_random_engine re; + std::uniform_real_distribution rand_double(std::numeric_limits::min(), + std::numeric_limits::max()); + std::uniform_int_distribution rand_int64(std::numeric_limits::min(), + std::numeric_limits::max()); + auto r = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> int64_t { return rand_int64(re); }); + auto d = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> double { return rand_double(re); }); + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT8}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT16}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + 
schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-2}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-1}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); + } + + std::vector views(cols.begin(), cols.end()); + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -470,10 +559,11 @@ TEST_F(RowToColumnTests, Non2Power) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -498,10 +588,11 @@ TEST_F(RowToColumnTests, Big) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -526,10 +617,11 @@ TEST_F(RowToColumnTests, Bigger) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -554,10 +646,11 @@ TEST_F(RowToColumnTests, Biggest) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index cdd0623eb77..109ee571b7d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2697,14 +2697,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_table) { +JNIEXPORT jlongArray JNICALL +Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::old_convert_to_rows(*n_input_table); + std::vector> cols = + cudf::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2736,10 +2737,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_column, - jintArray types, - jintArray scale) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized( + JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); @@ -2753,7 +2752,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::old_convert_from_rows(list_input, types_vec); + std::unique_ptr result = + cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 1808c7534df..e6cd9a9da32 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,8 +21,6 @@ #include #include -#include -#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -30,10 +28,12 @@ #endif #include +#include #include #include #include #include +#include #include #include #include @@ -51,7 +51,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr 
auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -331,17 +331,9 @@ struct block_info { int buffer_num; __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes, - bool debug_print = false) const { + size_type const *const col_sizes) const { return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } - __host__ __device__ size_type get_dest_row_size(size_type const *const col_offsets, - size_type const *const col_sizes, - bool debug_print = false) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + - util::div_rounding_up_unsafe(num_cols(), 8), - 8); - } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -404,16 +396,15 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ group.sync(); auto const blocks_remaining = - std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); size_t fetch; size_t subset; for (subset = fetch = 0; subset < blocks_remaining; ++subset) { // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -429,9 +420,9 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it more - // appealing to copy element-wise from input data into shared matching the end layout and do - // row-based memcopies out. + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. 
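
The element-then-row strategy described in the comment above is easier to see in miniature. What follows is a minimal host-side sketch, not the kernel: the names (col_data, col_offsets, row_size) and the plain vector standing in for shared memory are illustrative assumptions, and row padding/alignment are ignored for brevity.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main()
{
  // Two fixed-width columns are staged element-by-element into a row-major
  // scratch buffer (standing in for shared memory), then copied out one whole
  // row at a time. All names and sizes here are illustrative assumptions.
  constexpr int num_rows = 4;
  std::vector<int32_t> col_a{0, 1, 2, 3};      // a 4-byte column
  std::vector<int64_t> col_b{10, 11, 12, 13};  // an 8-byte column
  const void* col_data[]  = {col_a.data(), col_b.data()};
  int const col_sizes[]   = {4, 8};
  int const col_offsets[] = {0, 4};  // byte offset of each column within a row
  int const row_size      = 12;      // 4 + 8 bytes; real rows are padded/aligned

  std::vector<int8_t> scratch(row_size * num_rows);  // "shared memory"
  for (int col = 0; col < 2; ++col) {                // element-wise copies in
    for (int row = 0; row < num_rows; ++row) {
      std::memcpy(scratch.data() + row * row_size + col_offsets[col],
                  static_cast<int8_t const*>(col_data[col]) + row * col_sizes[col],
                  col_sizes[col]);
    }
  }

  std::vector<int8_t> out(row_size * num_rows);  // final row-major destination
  for (int row = 0; row < num_rows; ++row) {     // row-wise copies out
    std::memcpy(out.data() + row * row_size, scratch.data() + row * row_size, row_size);
  }

  int64_t check = 0;  // read back row 2 of the 8-byte column
  std::memcpy(&check, out.data() + 2 * row_size + col_offsets[1], sizeof check);
  std::printf("row 2, col_b = %lld\n", static_cast<long long>(check));
  return 0;
}

Staging in the final row layout first means the copies out to the destination buffer are long, contiguous, row-sized writes, which is the payoff the comment is describing.
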
for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; @@ -445,7 +436,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; - // copy the element to global memory + // copy the element from global memory cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -454,10 +445,8 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -466,7 +455,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; + output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; auto const shared_offset = block_row_size * relative_row; cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, @@ -563,8 +552,8 @@ __global__ void copy_validity_from_columns( input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make - // the bytes we actually write. + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data @@ -1085,8 +1074,8 @@ static inline int32_t compute_fixed_width_layout(std::vector co } // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it - // in + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in int32_t validity_bytes_needed = (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. @@ -1209,11 +1198,11 @@ std::vector build_block_infos(std::vector const &column_s }; // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. 
+ // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = std::clamp( util::round_up_safe( @@ -1478,8 +1467,8 @@ std::vector> convert_to_rows(cudf::table_view cons } std::vector> -old_convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { const cudf::size_type num_columns = tbl.num_columns(); std::vector schema; @@ -1656,10 +1645,9 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr old_convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 517202f3892..edc2768d4bb 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -25,11 +25,11 @@ namespace cudf { namespace java { -std::vector> -old_convert_to_rows(cudf::table_view const &tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); std::vector> convert_to_rows(cudf::table_view const &tbl, @@ -37,11 +37,10 @@ convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr -old_convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, From c0e989570bfdcbd51bb2abed0bed87a5c7f5cedd Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 15 Oct 2021 15:20:52 -0700 Subject: [PATCH 24/80] code cleanup and removed comments --- java/src/main/native/src/TableJni.cpp | 10 
---------- 1 file changed, 10 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 109ee571b7d..d0e6b895a1e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -598,20 +598,16 @@ class native_arrow_ipc_reader_handle final { static jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, std::vector> &extra_columns) { - std::cout << "entering convert_table_for_return\n"; std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::cout << "0\n"; for (int i = 0; i < table_cols; i++) { outcol_handles[i] = reinterpret_cast(ret[i].release()); } - std::cout << "1\n"; for (size_t i = 0; i < extra_columns.size(); i++) { outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); } - std::cout << "exiting convert_table_for_return\n"; return outcol_handles.get_jArray(); } @@ -2721,12 +2717,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { - std::cout << "convert_to_rows\n"; cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::cout << "before convert_to_rows\n"; std::vector> cols = cudf::convert_to_rows(*n_input_table); - std::cout << "after convert_to_rows\n"; int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2767,7 +2760,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e JNI_NULL_CHECK(env, types, "types is null", 0); try { - std::cout << "convert_from_rows\n"; cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_column); cudf::lists_column_view list_input(*input); @@ -2777,9 +2769,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::cout << "before convert_from_rows\n"; std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); - std::cout << "after convert_from_rows\n"; return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); From b9df725c892767f37a386b51b80c04a42da39bc7 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Oct 2021 00:53:01 +0000 Subject: [PATCH 25/80] Fixing validity buffer alignment issue for row data --- cpp/src/row_conversion/row_conversion.cu | 142 ++++++++++++-------- cpp/tests/row_conversion/row_conversion.cpp | 63 ++++++--- java/src/main/native/src/row_conversion.cu | 58 +++++--- 3 files changed, 165 insertions(+), 98 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0457bbf71e4..90bd8b88ef0 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -469,6 +469,7 @@ __global__ void copy_from_columns(const size_type num_rows, // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. 
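
The hunk that follows stages reads through shared_buffer_base and switches on col_size so the async copy can be handed a compile-time size and alignment guarantee (cuda::aligned_size_t in the patch). Below is a rough host-side analogue of that dispatch; the function names are invented for this sketch and plain memcpy stands in for cuda::memcpy_async.

#include <cstdint>
#include <cstdio>
#include <cstring>

// A host-side stand-in for the size-dispatched copy: the switch tells the copy
// routine the exact (and therefore aligned) element size at compile time, which
// is the role cuda::aligned_size_t plays for cuda::memcpy_async in the kernel.
// The function names below are assumptions made for this sketch.
template <int Size>
void copy_fixed(void* dst, void const* src)
{
  // device code in the patch instead does roughly:
  //   cuda::memcpy_async(dst, src, cuda::aligned_size_t<Size>(Size), barrier);
  std::memcpy(dst, src, Size);
}

void copy_element(void* dst, void const* src, int col_size)
{
  switch (col_size) {
    case 2: copy_fixed<2>(dst, src); break;
    case 4: copy_fixed<4>(dst, src); break;
    case 8: copy_fixed<8>(dst, src); break;
    default: std::memcpy(dst, src, col_size); break;  // other sizes: no hint
  }
}

int main()
{
  alignas(8) int64_t src = 42;
  alignas(8) int64_t dst = 0;
  copy_element(&dst, &src, sizeof src);
  std::printf("copied %lld\n", static_cast<long long>(dst));
  return 0;
}
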
+ auto const shared_buffer_base = shared[fetch % stages_count]; for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; @@ -493,14 +494,36 @@ __global__ void copy_from_columns(const size_type num_rows, printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", fetch, fetch % stages_count, - &shared[fetch % stages_count][shared_offset], + &shared_buffer_base[shared_offset], input_src, *input_src, col_size); // copy the element from global memory - cuda::memcpy_async( - &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<2>(col_size), + fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<4>(col_size), + fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<8>(col_size), + fetch_barrier); + break; + default: + cuda::memcpy_async( + &shared_buffer_base[shared_offset], input_src, col_size, fetch_barrier); + break; + } } } @@ -511,15 +534,15 @@ __global__ void copy_from_columns(const size_type num_rows, if (debug_print) printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; - auto const output_dest = - output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -533,8 +556,10 @@ __global__ void copy_from_columns(const size_type num_rows, block_row_size, absolute_row); - cuda::memcpy_async( - output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); + cuda::memcpy_async(output_dest, + &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), + subset_barrier); } } @@ -641,8 +666,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 31) / 32; - auto const num_sections_y = (num_block_rows + 7) / 8; + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; @@ -690,7 +715,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, my_section_idx, total_sections); auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 8; + auto const relative_row = section_y * 32; 
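
The validity loop below turns column-oriented validity words into row-oriented bits with __ballot_sync, one 32-column by 32-row section per warp. Here is a host-side model of that transpose, assuming a full section with all 32 lanes participating; the array names and values are invented for illustration.

#include <cstdint>
#include <cstdio>

int main()
{
  // col_validity[lane] models the 32-bit word lane `lane` reads for its column:
  // bit i says whether row i of that column is valid. The inner loop reproduces
  // what __ballot_sync(mask, my_data & (1u << i)) hands back to the warp: bit
  // `lane` of the result is column `lane`'s validity for row i.
  uint32_t col_validity[32];
  for (uint32_t lane = 0; lane < 32; ++lane) { col_validity[lane] = 0x0F0F0F0Fu ^ lane; }

  uint32_t row_validity[32] = {};  // bit `lane` of row_validity[i] = column lane, row i
  for (int i = 0; i < 32; ++i) {
    uint32_t ballot = 0;
    for (int lane = 0; lane < 32; ++lane) {
      if (col_validity[lane] & (1u << i)) { ballot |= 1u << lane; }
    }
    row_validity[i] = ballot;  // the lead lane writes this into the shared validity buffer
  }

  std::printf("row 0 validity bits: 0x%08x\n", row_validity[0]);
  return 0;
}
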
auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; @@ -720,15 +745,15 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_row, relative_col, absolute_col); - auto my_byte = - input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + auto my_data = input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] + : std::numeric_limits::max(); if (print_debug) printf( - "thread %d's byte is 0x%x, participation mask is 0x%x for relative row %d(%d real), " + "thread %d's bytes are 0x%x, participation mask is 0x%x for relative row %d(%d real), " "relative col %d(%d absolute)\n", threadIdx.x, - my_byte & 0xFF, + my_data, participation_mask, relative_row, absolute_row, @@ -738,8 +763,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // every thread that is participating in the warp has a byte, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; @@ -750,8 +776,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, "0x%x\n", threadIdx.x, blockIdx.x, - byte_mask, - my_byte & byte_mask, + dw_mask, + my_data & dw_mask, validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, validity_write_offset, validity_data); @@ -804,6 +830,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // make sure entire block has finished copy group.sync(); + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { auto const relative_row = row - block.start_row; @@ -835,9 +864,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, word_index(block.start_col), this_shared_block[validity_data_row_length * relative_row]); } - auto const output_ptr = - output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + auto const output_ptr = output_data_base + row_offsets[row]; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); cuda::memcpy_async( output_ptr, @@ -970,11 +998,20 @@ static __device__ void fetch_blocks_for_row_to_column( row += blockDim.x) { auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; if (debug_print) - printf("fetching block %lu to shared chunk %lu. %p <- %p\n", - fetch_index, - fetch_index % max_resident_blocks, - &shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset]); + printf( + "%d - fetching block %lu to shared chunk %lu. 
%p(shared[%d %% %d][%d]) <- %p(row %d, row " + "offset %d starting col offset %d)\n", + threadIdx.x, + fetch_index, + fetch_index % max_resident_blocks, + &shared[fetch_index % max_resident_blocks][shared_offset], + (int)fetch_index, + max_resident_blocks, + shared_offset, + &input_data[row_offsets[row] + starting_col_offset], + row, + row_offsets[row], + starting_col_offset); // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], @@ -1021,7 +1058,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; // threadIdx.x == 0 && blockIdx.x == 0; + constexpr bool debug_print = false; // threadIdx.x == 2 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1094,12 +1131,12 @@ __global__ void copy_to_columns(const size_type num_rows, auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); subset_barrier.arrive_and_wait(); auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf("%d-%d reading block %lu at address %p\n", threadIdx.x, blockIdx.x, @@ -1159,19 +1196,19 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf( - "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " - "shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", - relative_col, - relative_row, - absolute_col, - absolute_row, - shared_memory_row_offset, - shared_memory_offset, - column_size, - shmem_src, - dst/*, - *reinterpret_cast(shmem_src)*/); + "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " + "shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", + relative_col, + relative_row, + absolute_col, + absolute_row, + shared_memory_row_offset, + shared_memory_offset, + column_size, + shmem_src, + dst/*, + *reinterpret_cast(shmem_src)*/); printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); } if (debug_print && absolute_col == 0 && absolute_row == 51) { @@ -1185,7 +1222,7 @@ __global__ void copy_to_columns(const size_type num_rows, cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); } group.sync(); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf( "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); } @@ -1224,9 +1261,7 @@ __global__ void copy_validity_to_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; - // bool print_debug = false; - // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + constexpr bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; if (print_debug) 
{ printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("%d %d - block infos are at %p and my index is %d\n", @@ -1246,10 +1281,6 @@ __global__ void copy_validity_to_columns(const size_type num_rows, output_nm, row_offsets, block_infos); - /* printf("Row Offsets:\n"); - for (int i=0; i double { return rand_double(re); }); + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); + auto most_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); + auto few_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); + for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::INT8}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::INT16}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + if (i < 5) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + } else { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) + .release() + .release()); + } schema.push_back(cudf::data_type{cudf::type_id::INT32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); } for (int i = 0; i < 10; ++i) { cols.push_back( *cudf::test::fixed_width_column_wrapper( - r, r + num_rows) + r, r + num_rows, all_valid) .release() .release()); schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); @@ -505,25 +524,25 @@ TEST_F(RowToColumnTests, AllTypesLarge) for (int i = 0; i < 10; ++i) { cols.push_back( *cudf::test::fixed_width_column_wrapper( - r, r + num_rows) + r, r + num_rows, most_valid) .release() .release()); schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-2}) - .release() - .release()); + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, all_valid, 
numeric::scale_type{-2}) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-1}) - .release() - .release()); + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, most_valid, numeric::scale_type{-1}) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index e6cd9a9da32..a67589fbaec 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -424,6 +424,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. + auto const shared_buffer_base = shared[fetch % stages_count]; for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; @@ -437,8 +438,24 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const input_src = input_data[absolute_col] + col_size * absolute_row; // copy the element from global memory - cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, - fetch_barrier); + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<2>(col_size), fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<4>(col_size), fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<8>(col_size), fetch_barrier); + break; + default: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, + fetch_barrier); + break; + } } } @@ -448,18 +465,17 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = - output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; auto const shared_offset = block_row_size * relative_row; - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, - subset_barrier); + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), subset_barrier); } } @@ -523,8 +539,8 @@ __global__ void copy_validity_from_columns( auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 31) / 32; - auto const num_sections_y = (num_block_rows + 7) / 8; + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const 
num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; @@ -536,26 +552,27 @@ __global__ void copy_validity_from_columns( // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 8; + auto const relative_row = section_y * 32; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { - auto my_byte = - input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + auto my_data = input_nm[absolute_col] != nullptr ? + input_nm[absolute_col][absolute_row / 32] : + std::numeric_limits::max(); // every thread that is participating in the warp has a byte, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; @@ -585,11 +602,13 @@ __global__ void copy_validity_from_columns( // make sure entire block has finished copy group.sync(); + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { auto const relative_row = row - block.start_row; - auto const output_ptr = - output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const output_ptr = output_data_base + row_offsets[row]; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); cuda::memcpy_async( @@ -917,8 +936,6 @@ __global__ void copy_validity_to_columns( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; - auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( output_nm[col] + word_index(block_start_row), @@ -1111,7 +1128,7 @@ static size_type compute_column_information(iterator begin, iterator end, fixed_width_size_per_row += col_size; } - auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); return fixed_width_size_per_row; @@ -1233,7 +1250,6 @@ std::vector 
build_block_infos(std::vector const &column_s if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = From 8d00447eb0e9c6166b6f3a01b199dbd9c0a88c9a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Oct 2021 18:02:07 +0000 Subject: [PATCH 26/80] Cleaning up code for PR --- cpp/src/row_conversion/row_conversion.cu | 4132 ++++++++------------ java/src/main/native/src/row_conversion.cu | 237 +- 2 files changed, 1740 insertions(+), 2629 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 90bd8b88ef0..c068a2c0b76 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,2487 +14,1653 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include "cudf/detail/iterator.cuh" -#include "cudf/lists/lists_column_device_view.cuh" - -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; -constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; -#endif - -using cudf::detail::make_device_uvector_async; -using rmm::device_uvector; -namespace cudf { - -namespace detail { - -static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) -{ - return (offset + alignment - 1) & ~(alignment - 1); -} - -__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* input_offset_in_row, - const cudf::size_type* num_bytes, - int8_t** output_data, - cudf::bitmask_type** output_nm, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
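// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// The launch geometry described in the comment above: blockDim.x threads each own one
// row of a "row group", and the blockDim.y lanes stride across that row's columns. A
// small host-side helper making the row-group arithmetic concrete (hypothetical name):
inline int sketch_num_row_groups(int num_rows, int threads_x)
{
  int const rows_per_group = threads_x;  // one x-thread per row of a group
  return (num_rows + rows_per_group - 1) / rows_per_group;  // ceiling division
}
// e.g. with dim3 threads(64, 4): thread (x, y) of row group g handles row g * 64 + x
// and copies columns y, y + 4, y + 8, ... of that row.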
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - const int64_t* long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. - uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - const int8_t* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t* col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t* short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t* int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t* long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type* nm = output_nm[col_index]; - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } -} - -__global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* output_offset_in_row, - const cudf::size_type* num_bytes, - const int8_t** input_data, - const cudf::bitmask_type** input_nm, - int8_t* output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. 
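// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// The pattern the comment above describes: out-of-range threads guard their work with an
// if instead of returning, because every thread in the block still has to reach the
// __syncthreads() and help stage data through shared memory. Minimal CUDA shape of that
// pattern (hypothetical kernel, assumes blockDim.x <= 256):
__global__ void sketch_guarded_copy(int const* in, int* out, int num_rows)
{
  __shared__ int staging[256];
  int const row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < num_rows) { staging[threadIdx.x] = in[row]; }  // guard, do not return early
  __syncthreads();  // every thread of the block must arrive here
  if (row < num_rows) { out[row] = staging[threadIdx.x]; }
}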
- if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t* col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t* short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t* int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t* long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - int64_t* long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int buffer_num; - - __host__ __device__ size_type get_shared_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool 
debug_print = false) const - { - if (debug_print) - printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", - end_col, - &col_offsets[end_col], - end_col, - &col_sizes[end_col], - start_col, - &col_offsets[start_col], - col_offsets[end_col], - col_sizes[end_col], - col_offsets[start_col]); - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); - } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } - - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } -}; - -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. -struct row_batch { - size_type num_bytes; - size_type row_count; -}; - -/** - * @brief copy data from cudf columns into x format, which is row-based - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param input_data pointer to raw table data - * @param input_nm pointer to validity data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param row_offsets offset to a specific row in the input data - * @param output_data pointer to output data - * - */ -__global__ void copy_from_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type num_block_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - const block_info* block_infos, - const size_type* row_offsets, - int8_t** output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
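// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// A block_info describes a rectangle of the table, and the bytes one of its rows
// occupies in shared memory is the span from the first column's offset to the end of
// the last column, padded to 8 bytes. Standalone sketch of that calculation
// (hypothetical helper mirroring get_shared_row_size above):
inline int sketch_shared_row_size(int const* col_offsets, int const* col_sizes,
                                  int start_col, int end_col)
{
  int const span = col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col];
  return (span + 7) & ~7;  // same as align_offset(span, 8)
}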
- - constexpr bool debug_print = false; // blockIdx.x == 0 && threadIdx.x == 1; - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("col sizes at %p, col offsets at %p, and row offsets at %p\n", - col_sizes, - col_offsets, - row_offsets); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); - printf("shared memory pointers are %p and %p\n", shared[0], shared[1]); - printf("shared_memory ends at %p\n", &shared_data[shmem_used_per_block * 2]); - printf("group is %d threads\n", group.size()); - } - // else { return; } - - auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - if (debug_print) - printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch); - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; - if (debug_print) - printf("block %lu rows %d-%d and cols %d-%d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch, - fetch_block.start_row, - fetch_block.end_row, - fetch_block.start_col, - fetch_block.end_col); - - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto& fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - - // to do the copy we need to do n column copies followed by m element copies OR - // we have to do m element copies followed by r row copies. When going from column - // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it - // more appealing to copy element-wise from input data into shared matching the end layout - // and do row-based memcopies out. 
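// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// "Copy element-wise into shared matching the end layout": each thread takes a flat
// element index within the block, splits it into (column, row), and computes the
// row-major byte offset that element will have in the staged output row. Index math
// only, with hypothetical names (rel_col_offset is the column's offset minus the
// offset of the block's first column):
struct sketch_shared_slot { int rel_col; int rel_row; int shared_offset; };
inline sketch_shared_slot sketch_element_slot(int el, int num_fetch_rows,
                                              int block_row_size, int rel_col_offset)
{
  int const rel_col = el / num_fetch_rows;
  int const rel_row = el % num_fetch_rows;
  return {rel_col, rel_row, rel_row * block_row_size + rel_col_offset};
}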
- - auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { - auto const relative_col = el / num_fetch_rows; - auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; - if (debug_print) - printf("row %d(%d), col %d(%d), %d fetch rows, element %d\n", - relative_row, - absolute_row, - relative_col, - absolute_col, - num_fetch_rows, - el); - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; - auto const input_src = input_data[absolute_col] + col_size * absolute_row; - - if (debug_print) - printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", - fetch, - fetch % stages_count, - &shared_buffer_base[shared_offset], - input_src, - *input_src, - col_size); - - // copy the element from global memory - switch (col_size) { - case 2: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<2>(col_size), - fetch_barrier); - break; - case 4: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<4>(col_size), - fetch_barrier); - break; - case 8: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<8>(col_size), - fetch_barrier); - break; - default: - cuda::memcpy_async( - &shared_buffer_base[shared_offset], input_src, col_size, fetch_barrier); - break; - } - } - } - - auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; - if (debug_print) - printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; - - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - if (debug_print) - printf("processing row %d\noutput data[%d] is address %p\n", - absolute_row, - absolute_row, - output_dest); - auto const shared_offset = block_row_size * relative_row; - if (debug_print) - printf("memcpy %p <- %p - %d bytes which is row %d\n", - output_dest, - &shared[subset % stages_count][shared_offset], - block_row_size, - absolute_row); - - cuda::memcpy_async(output_dest, - &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), - subset_barrier); - } - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data pointer to 
output data, partitioned by data size - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ -__global__ void copy_validity_from_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - int8_t** output_data, - const size_type validity_offset, - const block_info* block_infos, - const size_type num_block_infos, - const bitmask_type** input_nm) -{ - extern __shared__ int8_t shared_data[]; - int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - constexpr bool print_debug = false; // threadIdx.x==0 && blockIdx.x == 0; - // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; - if (print_debug) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d %d - block infos are at %p and my index is %d\n", - threadIdx.x, - blockIdx.x, - block_infos, - blockIdx.x); - printf("%d %d - input nm is %p, input_nm[0] is at %p\n", - threadIdx.x, - blockIdx.x, - input_nm, - input_nm[0]); - printf("shared memory is %p to %p\n", shared_data, shared_data + shmem_used_per_block * 2); - printf("block infos at %p and this is index %d\n", - &block_infos, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + 0); - /* printf("Row Offsets:\n"); - for (int i=0; i - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - if (print_debug) - printf("%d: waiting at barrier %d\n", - threadIdx.x, - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED); - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); - if (print_debug) printf("past barrier...\n"); - } - int8_t* this_shared_block = shared_blocks[validity_block % 2]; - if (print_debug) printf("top of loop for validity block %d\n", validity_block); - if (print_debug) - printf("reading validity block info %d at %p\n", - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - &block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]); - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); - auto const total_sections = num_sections_x * num_sections_y; - - if (print_debug) { - printf("%d %d - block %d has %d cols, %d rows, %d row length, and %d total sections\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - num_block_cols, - num_block_rows, - validity_data_row_length, - total_sections); - } - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = 
std::max(1u, blockDim.x / detail::warp_size); - - if (print_debug) - printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, " - "blockDim.x=%d, " - "warp size " - "%d\n", - threadIdx.x, - blockIdx.x, - warp_id, - total_sections, - num_sections_x, - num_sections_y, - warps_per_block, - blockDim.x, - detail::warp_size); - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - - if (print_debug) - printf("working on section %d,%d - %d of %d...\n", - section_x, - section_y, - my_section_idx, - total_sections); - auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; - - if (print_debug) printf("pre ballot sync...\n"); - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); - - if (print_debug) - printf( - "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d " - "absolute)\n", - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - - if (absolute_col < num_columns) { - if (print_debug) - printf( - "thread %d's byte is at %p, participation mask is 0x%x for relative row %d(%d real), " - "relative col %d(%d absolute)\n", - threadIdx.x, - &input_nm[absolute_col][absolute_row / 32], - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - auto my_data = input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] - : std::numeric_limits::max(); - - if (print_debug) - printf( - "thread %d's bytes are 0x%x, participation mask is 0x%x for relative row %d(%d real), " - "relative col %d(%d absolute)\n", - threadIdx.x, - my_data, - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - - // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to - // make the bytes we actually write. 
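// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// The bit transpose described above: each lane of the warp holds one column's validity
// word covering the same 32 rows, and a ballot on bit i across the warp produces the
// row-major word for row i (lane k contributes column k's bit). One step of that
// shuffle, stripped of the surrounding bookkeeping (hypothetical device helper, assumes
// all 32 lanes participate):
__device__ inline unsigned sketch_validity_transpose_step(unsigned my_column_word, int row_bit)
{
  unsigned const my_bit = my_column_word & (1u << row_bit);  // (my column, this row) valid?
  return __ballot_sync(0xFFFFFFFFu, my_bit);                 // bit k = lane k's column bit
}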
- bitmask_type dw_mask = 1; - for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { - if (print_debug) - printf( - "%d %d - byte_mask is 0x%x, masked_byte is 0x%x, shared_data_block[%d][%d] = " - "0x%x\n", - threadIdx.x, - blockIdx.x, - dw_mask, - my_data & dw_mask, - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, - validity_write_offset, - validity_data); - if (cols_left <= 8) { - // write byte - if (print_debug) - printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - if (print_debug) - printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - if (print_debug) - printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - if (print_debug) - printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; - - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - if (print_debug) { - printf( - "base output data is %p, row offset is 0x%x, validity offset into row is 0x%x, word " - "index of block start is 0x%x\n", - output_data[block.buffer_num], - row_offsets[row], - validity_offset, - word_index(block.start_col)); - printf( - "%d %d - row %d/%d/%d col %d-%d - %p = shared_data_block[%d][%d] which is %p - %d " - "bytes\n - %p <- 0x%x\n", - threadIdx.x, - blockIdx.x, - block.start_row, - row, - block.end_row, - block.start_col, - block.end_col, - output_data[block.buffer_num] + row_offsets[row] + validity_offset + - (word_index(block.start_col)), - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, - validity_data_row_length * relative_row, - &this_shared_block[validity_data_row_length * relative_row], - util::div_rounding_up_unsafe(num_block_cols, 8), - output_data[block.buffer_num] + row_offsets[row] + validity_offset + - word_index(block.start_col), - this_shared_block[validity_data_row_length * relative_row]); - } - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - 
cuda::memcpy_async( - output_ptr, - &this_shared_block[validity_data_row_length * relative_row], - num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } -} - -static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, - size_t col_offset_size, - int const num_cols) -{ - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; -} - -/** - * @brief ensure `read_ahead` buffer blocks are fetched - * - * @param fetch_index internal state passed into the function - * @param processing_index index where processing is occuring - * @param read_ahead_count how many blocks to read ahead - * @param max_resident_blocks how many blocks can be loaded at once - * @param total_blocks total number of blocks overall - * @param block_infos pointer to the block infos - * @param col_sizes pointer to column size information - * @param col_offsets pointer to the table's column offsets - * @param row_offsets pointer to offsets for each row in the table - * @param input_data pointer to the input data - * @param shared pointer to shared memory - * @param group thread group participating in the fetch - * @param block_barrier barriers used for each block - * @param debug_print - * @return - */ -static __device__ void fetch_blocks_for_row_to_column( - size_t& fetch_index, - size_t const processing_index, - int const read_ahead_count, - int const max_resident_blocks, - int const total_blocks, - block_info const* const block_infos, - size_type const* const col_sizes, - size_type const* const col_offsets, - size_type const* const row_offsets, - int8_t const* const input_data, - int8_t* shared[], - cooperative_groups::thread_block const group, - cuda::barrier* block_barrier, - bool debug_print) -{ - for (; fetch_index < static_cast(total_blocks) && - fetch_index < (processing_index + read_ahead_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - if (debug_print) - printf( - "fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, " - "ending " - "offset %p\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks, - fetch_block.start_col, - fetch_block.end_col, - &col_offsets[fetch_block.start_col], - &col_offsets[fetch_block.end_col]); - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); - auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - - auto shared_row_offset = 0; - // copy the data for column sizes - if (debug_print) - printf("%d: col sizes memcpy_async(group, %p, %p, %d, barrier);\n", - threadIdx.x, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_size_bytes); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) - printf("%d-%d fetching to %p with barrier %p\n", - threadIdx.x, - blockIdx.x, - shared[fetch_index % max_resident_blocks], - &fetch_barrier); - cuda::memcpy_async(group, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_sizes[fetch_block.start_col], - col_size_bytes, - fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - if (debug_print) - printf("%d: offsets memcpy_async(group, %p, %p, %d, barrier);\n", - threadIdx.x, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_offset_bytes); - cuda::memcpy_async(group, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_offset_bytes, - fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; - row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - if (debug_print) - printf( - "%d - fetching block %lu to shared chunk %lu. 
%p(shared[%d %% %d][%d]) <- %p(row %d, row " - "offset %d starting col offset %d)\n", - threadIdx.x, - fetch_index, - fetch_index % max_resident_blocks, - &shared[fetch_index % max_resident_blocks][shared_offset], - (int)fetch_index, - max_resident_blocks, - shared_offset, - &input_data[row_offsets[row] + starting_col_offset], - row, - row_offsets[row], - starting_col_offset); - // copy the main - cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, - fetch_barrier); - } - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ -__global__ void copy_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - int8_t** output_data, - const size_type* _col_sizes, - const size_type* _col_offsets, - const block_info* block_infos, - const size_type num_block_infos, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
- - // to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on - - constexpr bool debug_print = false; // threadIdx.x == 2 && blockIdx.x == 0; - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf( - "%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - - auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), - col_offset_size = sizeof(decltype(*_col_offsets))]( - int const num_cols, - int const num_rows) -> std::tuple { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; - }; - - if (debug_print) - printf("%d blocks remaining -> %d block infos, %d block index\n", - blocks_remaining, - num_block_infos, - blockIdx.x); - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - fetch_blocks_for_row_to_column(fetch, - subset, - stages_count, - stages_count, - blocks_remaining, - block_infos, - _col_sizes, - _col_offsets, - row_offsets, - input_data, - shared, - group, - block_barrier, - debug_print); - - auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - // ensure our data is ready - if (debug_print) - printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - if (debug_print) - printf("%d-%d reading block %lu at address %p\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset, - shared[subset % stages_count]); - - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); - // auto shared_row_offsets = shared[subset]; - auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); - - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes, debug_print); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. 
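// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// Shared memory for each block is laid out as [column sizes][column offsets][pad to 8]
// followed by one packed row after another, so the source offset of element
// (relative_row, relative_col) is found as below (hypothetical helper that mirrors the
// indexing used in the copy loop that follows):
inline int sketch_shmem_src_offset(int const* shared_col_offsets, int relative_col,
                                   int relative_row, int block_row_size,
                                   int col_size_bytes, int col_offset_bytes)
{
  int const admin_bytes = ((col_size_bytes + col_offset_bytes) + 7) & ~7;  // align to 8
  int const col_in_row  = shared_col_offsets[relative_col] - shared_col_offsets[0];
  return admin_bytes + relative_row * block_row_size + col_in_row;
}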
- // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - if (debug_print) - printf("copying for row %d(%d absolute) col %d(%d absolute)\n", - relative_row, - absolute_row, - relative_col, - absolute_col); - - auto const shared_memory_row_offset = block_row_size * relative_row; - if (debug_print) - printf("shared_col_offsets is %p and relative col is %d, making me access %p\n", - shared_col_offsets, - relative_col, - &shared_col_offsets[relative_col]); - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - if (debug_print) - printf("shared_col_sizes is %p and relative col is %d, making me access %p\n", - shared_col_sizes, - relative_col, - &shared_col_sizes[relative_col]); - auto const column_size = shared_col_sizes[relative_col]; - - int8_t* shmem_src = &shared[subset % stages_count][shared_memory_offset]; - int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf( - "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " - "shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", - relative_col, - relative_row, - absolute_col, - absolute_row, - shared_memory_row_offset, - shared_memory_offset, - column_size, - shmem_src, - dst/*, - *reinterpret_cast(shmem_src)*/); - printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); - } - if (debug_print && absolute_col == 0 && absolute_row == 51) { - printf("col0row51(%d bytes) = %p - 0x", column_size, shmem_src); - for (int i = 0; i < column_size; ++i) { - printf("%x ", shmem_src[i]); - } - printf("\n"); - } - - cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); - } - group.sync(); - if (debug_print) - printf( - "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ -__global__ void copy_validity_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - cudf::bitmask_type** output_nm, - const size_type validity_offset, - const block_info* block_infos, - const size_type num_block_infos, - const int8_t* input_data) -{ - extern __shared__ int8_t shared_data[]; - int8_t* 
shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - constexpr bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; - if (print_debug) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d %d - block infos are at %p and my index is %d\n", - threadIdx.x, - blockIdx.x, - block_infos, - blockIdx.x); - printf( - "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, " - "row " - "offsets are %p, block infos at %p\n", - threadIdx.x, - blockIdx.x, - shared_data, - shared_data + shmem_used_per_block, - input_data, - output_nm, - row_offsets, - block_infos); - } - // else { return; } - - // per conversation with DaveB - // each thread of warp reads a single byte of validity - so we read 32 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. - // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - if (print_debug) printf("%d blocks with %d in group\n", blocks_remaining, group.size()); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { - shared_block_barriers[validity_index].arrive_and_wait(); - } - int8_t* this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - - if (print_debug) { - printf("%d %d - block %d has %d cols, %d rows, and %d total sections\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - num_block_cols, - num_block_rows, - total_sections); - } - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - if (print_debug) - printf( - "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " - "%d\n", - threadIdx.x, - blockIdx.x, - warp_id, - total_sections, - warps_per_block, - blockDim.x, - detail::warp_size); - // the block is divided into sections. A warp operates on a section at a time. 
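// --- Illustrative sketch (editor's annotation, not part of this patch) ---
// Section decomposition for the row-to-column validity copy: the block's rectangle is
// tiled into sections of 8 columns by 32 rows, warps walk the sections, and within a
// section each lane owns one row. Index math only (hypothetical names):
struct sketch_section { int relative_col; int relative_row; };
inline sketch_section sketch_validity_section(int section_idx, int num_sections_x, int lane_id)
{
  int const section_x = section_idx % num_sections_x;
  int const section_y = section_idx / num_sections_x;
  return {section_x * 8,              // 8 columns per section, one output byte of bits
          section_y * 32 + lane_id};  // 32 rows per section, one row per warp lane
}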
- for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - - auto const relative_col = section_x * 8; - auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; - - /* if (print_debug) - printf("%d-%d: si: %d nsx: %d nsy: %d sx: %d sy: %d ar: %d nr: %d rc: %d rr: %d\n", - threadIdx.x, - blockIdx.x, - my_section_idx, - num_sections_x, - num_sections_y, - section_x, - section_y, - absolute_row, - num_rows, - relative_col, - relative_row);*/ - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; - - // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make - // the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; - ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / 8; - - if (print_debug) - printf( - "%d - Writing validity data for column %d, row %d 0x%x to shared memory location " - "%d(%d * (%d + %d) + %d / 8)\n", - threadIdx.x, - absolute_col + i, - absolute_row, - validity_data, - validity_write_offset, - validity_data_col_length, - relative_col, - i, - relative_row); - - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const starting_address = output_nm[col] + word_index(block_start_row); - - if (print_debug) - printf("%d %d - col %d memcpy_async(%p(offset %d), %p, %d, subset_barrier); - 0x%x\n", - threadIdx.x, - blockIdx.x, - col, - starting_address, - word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - words_to_copy * 4, - this_shared_block[validity_data_col_length * relative_col]); - cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last 
blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED - ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED - : blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } -} - -#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -/** - * Calculate the dimensions of the kernel for fixed width only columns. - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ -static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, - dim3& blocks, - dim3& threads) -{ - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) { y_block_size = 32; } - int x_possible_block_size = 1024 / y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size / size_per_row; - // If we don't have enough shared memory there is no point in having more threads - // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; - // Make sure that the x dimension is a multiple of 32 this not only helps - // coalesce memory access it also lets us do a ballot sync for validity to write - // the data back out the warp level. If x is a multiple of 32 then each thread in the y - // dimension is associated with one or more warps, that should correspond to the validity - // words directly. - int block_size = (max_block_size / 32) * 32; - CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. 
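// --- Illustrative worked example (editor's annotation, not part of this patch) ---
// For a hypothetical table of 10 fixed-width columns with size_per_row = 100 bytes:
//   y_block_size          = (10 + 3) / 4     = 3
//   x_possible_block_size = 1024 / 3         = 341
//   max_block_size        = 48 * 1024 / 100  = 491, then min(491, 341) = 341
//   block_size            = (341 / 32) * 32  = 320 threads in x
//   shared memory needed  = 100 * 320        = 32,000 bytes per block
// and num_blocks = ceil(num_rows / 320), clamped to at most 10240 just below.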
- num_blocks = 10240; - } - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; - threads.x = block_size; - threads.y = y_block_size; - threads.z = 1; - return size_per_row * block_size; -} - -/** - * When converting to rows it is possible that the size of the table was too big to fit - * in a single column. This creates an output column for a subset of the rows in a table - * going from start row and containing the next num_rows. Most of the parameters passed - * into this function are common between runs and should be calculated once. - */ -static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type size_per_row, - rmm::device_uvector& column_start, - rmm::device_uvector& column_size, - rmm::device_uvector& input_data, - rmm::device_uvector& input_nm, - const cudf::scalar& zero, - const cudf::scalar& scalar_size_per_row, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - - std::unique_ptr data = - cudf::make_numeric_column(cudf::data_type(cudf::type_id::INT8), - static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, - stream, - mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_from_fixed_width_columns<<>>( - start_row, - num_rows, - num_columns, - size_per_row, - column_start.data(), - column_size.data(), - input_data.data(), - input_nm.data(), - data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr); -} - -static cudf::data_type get_data_type(const cudf::column_view& v) { return v.type(); } - -static inline bool are_all_fixed_width(std::vector const& schema) -{ - return std::all_of( - schema.begin(), schema.end(), [](const cudf::data_type& t) { return cudf::is_fixed_width(t); }); -} - -/** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ -static inline int32_t compute_fixed_width_layout(std::vector const& schema, - std::vector& column_start, - std::vector& column_size) -{ - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. 
- int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); - // validity comes at the end and is byte aligned so we can pack more in. - at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -template -static size_type compute_column_information(iterator begin, - iterator end, - std::vector& column_starts, - std::vector& column_sizes) //, -// std::function nested_type_cb) -{ - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = fixed_width_size_per_row; - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; -} - -//#define DEBUG - -std::vector build_validity_block_infos( - size_type const& num_columns, - size_type const& num_rows, - size_type const& shmem_limit_per_block, - std::vector const& row_batches) -{ - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(), - 8); - // we fit as much as we can given the column stride - // note that an element in the table takes just 1 bit, but a row with a single - // element still takes 8 bytes! 
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); - - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int row = 0; - while (row < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(row_stride, rows_left_in_batch); - - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; - } - } - - return validity_block_infos; -} - -std::vector build_block_infos(std::vector const& column_sizes, - std::vector const& column_starts, - std::vector const& row_batches, - size_type const total_number_of_rows, - size_type const& shmem_limit_per_block) -{ - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges - // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = - // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The - // trick is that it's in bytes, not rows or columns. 
- size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, - row_batches[0].row_count); -#if defined(DEBUG) - printf( - "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " - "and num_rows is %d, batch row count is %d " - "- which makes window height " - "%d - admin size is %lu\n", - optimal_square_len, - (int)column_sizes.size(), - optimal_square_len / column_sizes[0], - total_number_of_rows, - row_batches[0].row_count, - window_height, - column_sizes.size() * sizeof(size_type) * 2); -#endif - - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. - return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "row size with end pad is %d and admin data is %d, which adds up to %d and that is too " - "large for shmem block of %d\n", - row_size_with_end_pad, - calc_admin_data_size(col - current_window_start_col), - row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col), - shmem_limit_per_block); - printf( - "Window size %d too large at column %d, admin size is %d, bumping back to build windows " - "of " - "size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is " - "%d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - calc_admin_data_size(col - current_window_start_col), - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? 
col : col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset " - "%d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match + #include + #include + #include + #include + #include + + #include + #include + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + #include + #endif + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; + constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; + constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; + constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; + constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; + #endif + + using cudf::detail::make_device_uvector_async; + using rmm::device_uvector; + namespace cudf { + + namespace detail { + + static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); + } + + __global__ void copy_from_rows_fixed_width_optimized( + const cudf::size_type num_rows, const cudf::size_type num_columns, + const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, + const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
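+   // Illustrative mapping (hypothetical launch, assuming blockDim = {64, 8}):
+   // threadIdx.x picks the row inside the current row_group (64 rows per
+   // group here) and threadIdx.y strides over the columns in steps of
+   // blockDim.y, so thread (x=3, y=2) copies row 3 of the group and columns
+   // 2, 10, 18, ... of the table.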
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + shared_index]; + } + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { + nm[word_index(row_index)] = bitmask; + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } + } + + __global__ void copy_to_rows_fixed_width_optimized( + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, + const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. 
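+       // (Every thread must still reach the __syncthreads() below and take
+       // part in the shared-memory flush of step 2, so out-of-range rows
+       // simply skip the per-column copy instead of returning early.)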
+ if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; + } + __syncthreads(); + // Go for the next round + } + } + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; + + __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) 
const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } + }; + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + + /** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ + __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type num_block_infos, + const int8_t **input_data, const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, + const size_type *row_offsets, int8_t **output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. 
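+   // (Sketch of the flow implemented below: each CUDA block owns up to
+   // NUM_BLOCKS_PER_KERNEL_TO_ROWS block_infos and double-buffers them
+   // through shared memory -- while one buffer is being written out to its
+   // destination rows, the next block_info is prefetched into the other
+   // buffer with cuda::memcpy_async, coordinated by per-buffer barriers.)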
+ + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; + auto const num_fetch_cols = fetch_block.num_cols(); + auto const num_fetch_rows = fetch_block.num_rows(); + auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_block.start_col]; + auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // wait for the last use of the memory to be completed + if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. 
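+       // Illustrative layout (hypothetical two-column block of {INT32, INT64}):
+       // the shared row size is align_offset(8 + 8 - 0, 8) = 16 bytes, so the
+       // element at (relative_row = r, relative_col = 1) is staged at shared
+       // offset r * 16 + 8 -- already the offset it will occupy in the output
+       // row, which is what makes the row-wise memcpy_async below possible.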
+ + auto const shared_buffer_base = shared[fetch % stages_count]; + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + // copy the element from global memory + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<2>(col_size), fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<4>(col_size), fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<8>(col_size), fetch_barrier); + break; + default: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, + fetch_barrier); + break; + } + } + } + + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; + auto const shared_offset = block_row_size * relative_row; + + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ + __global__ void copy_validity_to_rows( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each 
thread of warp reads a single int32 of validity - so we read 128 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. + // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 32; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (absolute_col < num_columns) { + auto my_data = input_nm[absolute_col] != nullptr ? + input_nm[absolute_col][absolute_row / 32] : + std::numeric_limits::max(); + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. 
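+         // Illustrative transpose (hypothetical values): lane k of the warp
+         // holds the 32-bit validity word of column (section_x * 32 + k)
+         // covering this section's rows. On iteration i the ballot below
+         // gathers bit i from every lane, producing one bit per column for
+         // row relative_row + i -- the row-major layout being staged in
+         // shared memory.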
+ bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (cols_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + auto const output_ptr = output_data_base + row_offsets[row]; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + + cuda::memcpy_async( + output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + for (int validity_block = 0; + validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } + } + + static __device__ std::tuple + get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param row_offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ + __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. 
+ // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + + size_t fetch_index; + size_t processing_index; + for (processing_index = fetch_index = 0; processing_index < blocks_remaining; + ++processing_index) { + // Fetch ahead up to stages_count groups + for (; fetch_index < static_cast(blocks_remaining) && + fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = _col_offsets[fetch_block.start_col]; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = + (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, fetch_barrier); + } + } + + auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // ensure our data is ready + processing_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + 
sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); + auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. + for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + auto const column_size = shared_col_sizes[relative_col]; + + int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); + } + group.sync(); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ + __global__ void copy_validity_from_rows( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single byte of validity - so we read 32 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = num_sections_y * 4; // words to bytes + auto const total_sections = num_sections_x * num_sections_y; + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? + NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : + blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } + } + + #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + /** + * Calculate the dimensions of the kernel for fixed width only columns. + * @param [in] num_columns the number of columns being copied. + * @param [in] num_rows the number of rows being copied. + * @param [in] size_per_row the size each row takes up when padded. + * @param [out] blocks the size of the blocks for the kernel + * @param [out] threads the size of the threads for the kernel + * @return the size in bytes of shared memory needed for each block. + */ + static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, + const cudf::size_type num_rows, + const cudf::size_type size_per_row, dim3 &blocks, + dim3 &threads) { + // We have found speed degrades when a thread handles more than 4 columns. + // Each block is 2 dimensional. The y dimension indicates the columns. + // We limit this to 32 threads in the y dimension so we can still + // have at least 32 threads in the x dimension (1 warp) which should + // result in better coalescing of memory operations. We also + // want to guarantee that we are processing a multiple of 32 threads + // in the x dimension because we use atomic operations at the block + // level when writing validity data out to main memory, and that would + // need to change if we split a word of validity data between blocks. 
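+   // Worked example (hypothetical table): 10 columns gives
+   // y_block_size = (10 + 3) / 4 = 3 and x_possible_block_size = 1024 / 3 = 341.
+   // With a 64-byte padded row, 48 KB of shared memory fits 768 rows, the min
+   // below trims that to 341, and rounding down to a warp multiple yields a
+   // block size of 320 threads in x.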
+ int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); + if (y_block_size > 32) { + y_block_size = 32; + } + int x_possible_block_size = 1024 / y_block_size; + // 48KB is the default setting for shared memory per block according to the cuda tutorials + // If someone configures the GPU to only have 16 KB this might not work. + int max_shared_size = 48 * 1024; + int max_block_size = max_shared_size / size_per_row; + // If we don't have enough shared memory there is no point in having more threads + // per block that will just sit idle + max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; + // Make sure that the x dimension is a multiple of 32 this not only helps + // coalesce memory access it also lets us do a ballot sync for validity to write + // the data back out the warp level. If x is a multiple of 32 then each thread in the y + // dimension is associated with one or more warps, that should correspond to the validity + // words directly. + int block_size = (max_block_size / 32) * 32; + CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); + + int num_blocks = (num_rows + block_size - 1) / block_size; + if (num_blocks < 1) { + num_blocks = 1; + } else if (num_blocks > 10240) { + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. + num_blocks = 10240; + } + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; + threads.x = block_size; + threads.y = y_block_size; + threads.z = 1; + return size_per_row * block_size; + } + + /** + * When converting to rows it is possible that the size of the table was too big to fit + * in a single column. This creates an output column for a subset of the rows in a table + * going from start row and containing the next num_rows. Most of the parameters passed + * into this function are common between runs and should be calculated once. 
+ */ + static std::unique_ptr + fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + + std::unique_ptr data = cudf::make_numeric_column( + cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_to_rows_fixed_width_optimized<<>>( + start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), + input_data.data(), input_nm.data(), data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); + } + + static cudf::data_type get_data_type(const cudf::column_view &v) { + return v.type(); + } + + static inline bool are_all_fixed_width(std::vector const &schema) { + return std::all_of(schema.begin(), schema.end(), + [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + } + + /** + * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @param [in] schema the types of columns that need to be laid out. + * @param [out] column_start the byte offset where each column starts in the row. + * @param [out] column_size the size in bytes of the data for each columns in the row. + * @return the size in bytes each row needs. + */ + static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) { + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. + int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); + // validity comes at the end and is byte aligned so we can pack more in. 
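+   // Illustrative layout (hypothetical schema {INT32, INT8, INT64}): the data
+   // packs at offsets 0, 4 and 8 (the INT64 re-aligns to 8), so at_offset is
+   // 16 here. Three columns need (3 + 7) / 8 = 1 validity byte, giving 17
+   // bytes, which the final alignment below pads to a 24-byte row.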
+ at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) + } + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + template + static size_type compute_column_information(iterator begin, iterator end, + std::vector &column_starts, + std::vector &column_sizes) //, + // std::function nested_type_cb) + { + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = fixed_width_size_per_row; + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; + } + + std::vector + build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_block, + std::vector const &row_batches) { + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! 
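+   // Illustrative sizing (hypothetical numbers, table wider than 156 columns):
+   // with shmem_limit_per_block ~= 24 KB, desired_rows_and_columns is
+   // sqrt(24576) ~= 156, which rounds down to a column_stride of 152. That is
+   // ceil(152 / 8) = 19 validity bytes per row, padded to 24, so row_stride
+   // becomes min(num_rows, 24576 / 24) = min(num_rows, 1024).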
+ auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; + } + + std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, current_window_start_row, end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, row_batches[0].row_count); + + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. 
+ return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory window boundary to match // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); - } - - return block_infos; -} - -#if defined(DEBUG) -void pretty_print(uint64_t i) -{ - if (i > (1 * 1024 * 1024 * 1024)) { - printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); - } else if (i > (1 * 1024 * 1024)) { - printf("%.2f MB", i / float(1 * 1024 * 1024)); - } else if (i > (1 * 1024)) { - printf("%.2f KB", float(i / 1024)); - } else { - printf("%lu Bytes", i); - } -} -#endif -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -} // namespace detail - -std::vector> convert_to_rows(cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#if defined(DEBUG) || 1 - total_shmem -= 1024; -#endif - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - -#if defined(DEBUG) - size_t free, total; - cudaMemGetInfo(&free, &total); - printf("%lu/%lu Memory\n", free, total); -#endif - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. 
Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = detail::compute_column_information(iter, - iter + num_columns, - column_starts, - column_sizes); //, - // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; - - auto calculate_variable_width_row_data_size = [](int const row) { - // each level of variable-width data will add an offset/length - // uint64 of data. The first of which is inside the fixed-width - // data itself and needs to be aligned based on what is around - // that data. 
This is handled above with the fixed-width calculations - // for that reason. We may still need to add more of these offset/length - // combinations if the nesting is deeper than one level as these - // will be included in the variable-width data blob at the end of the - // row. - return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - exclusive_scan([t](int row_index) { - size_type total_row_size = 0; - for (int i=0 i - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - -#if defined(DEBUG) - printf("%d rows and %d columns in table\n", num_rows, num_columns); - printf("%lu batches:\n", row_batches.size()); - for (auto i = 0; i < (int)row_batches.size(); ++i) { - printf("%d: %d rows, ", i, row_batches[i].row_count); - detail::pretty_print(row_batches[i].num_bytes); - printf("\n"); - } -#endif - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - -#if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", - block_infos.size(), - block_infos[0].end_col - block_infos[0].start_col + 1, - block_infos[0].end_row - block_infos[0].start_row); - detail::pretty_print(shmem_limit_per_block); - printf(" shared mem("); - detail::pretty_print(fixed_width_size_per_row); - printf("/row, %d columns, %d rows, ", num_columns, num_rows); - detail::pretty_print(total_table_size); - printf(" total):\n"); -#endif - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); - dim3 threads(256); - -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - detail::pretty_print(shmem_limit_per_block); - printf(" shared memory\n"); -#endif - detail::copy_from_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - block_infos.size(), - dev_input_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - 
dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); -#if defined(DEBUG) - printf("Launching validity kernel with %d blocks, for %lu validity blocks with %d threads, ", - validity_blocks.x, - validity_block_infos.size(), - validity_threads.x); - detail::pretty_print(total_shmem); - printf(" shared memory\n"); -#endif - detail:: - copy_validity_from_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - dev_row_offsets.data(), - dev_output_data.data(), - column_starts.back(), - dev_validity_block_infos.data(), - validity_block_infos.size(), - dev_input_nm.data()); - - // split up the output buffer into multiple buffers based on row batch sizes - // and create list of byte columns - int offset_offset = 0; - std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique( - data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique( - data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); - - ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); - } - - return ret; -#else - CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); - return {}; -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -} - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - const cudf::size_type num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. 
- max_rows_per_batch = (max_rows_per_batch / 32) * 32; - - cudf::size_type num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, - row_count, - num_columns, - size_per_row, - dev_column_start, - dev_column_size, - dev_input_data, - dev_input_nm, - *zero, - *step, - stream, - mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -std::unique_ptr convert_from_rows(cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#if defined(DEBUG) || 1 - total_shmem -= 1024; -#endif - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); -#else - dim3 threads(std::min(256, (int)child.size())); -#endif -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - detail::pretty_print(total_shmem); - printf(" shared memory\n"); -#endif - detail::copy_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - block_infos.size(), - child.data()); - - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, - row, - std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - printf( - "Launching validity kernel with %d blocks, for %lu validity blocks, col stride %d and row " - "stride of %d with %d threads, ", - validity_blocks.x, - validity_block_infos.size(), - column_stride, - row_stride, - threads.x); - detail::pretty_print(total_shmem); - 
printf(" shared memory\n"); -#endif - - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail:: - copy_validity_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_nm.data(), - column_starts.back(), - dev_validity_block_infos.data(), - validity_block_infos.size(), - child.data()); - - return std::make_unique(std::move(output_columns)); -#else - CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); - return {}; -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -} - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - // printf("Launching (%d, %d, %d) blocks, (%d, %d, %d) threads, with %d shared size\n", - // blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shared_size); - // printf("pointers are column_start: %p, column_size: %p, output_data: %p, output_nm: %p\n", - // dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), - // dev_output_nm.data()); - detail::copy_to_fixed_width_columns<<>>( - num_rows, - num_columns, - size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), - child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -} // namespace cudf + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + } 
+
+  return block_infos;
+}
+
+#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+} // namespace detail
+
+std::vector<std::unique_ptr<cudf::column>> convert_to_rows(cudf::table_view const &tbl,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource *mr) {
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+  // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the
+  // data, but small enough that multiple columns fit in memory so the writes can coalesce as well.
+  // Potential optimization for window sizes.
+  const size_type num_columns = tbl.num_columns();
+  const size_type num_rows = tbl.num_rows();
+
+  int device_id;
+  CUDA_TRY(cudaGetDevice(&device_id));
+  int total_shmem;
+  CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
+
+  // TODO: why?
+  total_shmem -= 1024;
+  int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED;
+
+  // break up the work into blocks, which are a starting and ending row/col #.
+  // this window size is calculated based on the shared memory size available
+  // we want a single block to fill up the entire shared memory space available
+  // for the transpose-like conversion.
+
+  // There are two different processes going on here. The GPU conversion of the data
+  // and the writing of the data into the list of byte columns that are a maximum of
+  // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand
+  // this limitation because the column must own the data inside and as a result it must be
+  // a distinct allocation for that column. Copying the data into these final buffers would
+  // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer.
+  // The windows are broken at the boundaries of specific rows based on the row sizes up
+  // to that point. These are row batches and they are decided first before building the
+  // windows so the windows can be properly cut around them.
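The row-batch idea described above can be sketched in isolation: keep appending 8-byte-aligned row sizes until the running total would overflow a 32-bit list offset, then close the batch on a 32-row boundary. The sketch below is illustrative only; it uses a constant, made-up row size, whereas the code that follows uses the per-row sizes it computes.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// Each output lists column uses 32-bit offsets, so a batch of rows must stay under
// std::numeric_limits<int32_t>::max() bytes, and batches are cut on 32-row boundaries.
struct batch {
  int64_t num_bytes;
  int row_count;
};

int main() {
  int const num_rows = 50000;
  int64_t const row_size = 64 * 1024; // pretend every (already 8-byte aligned) row costs 64 KiB

  std::vector<batch> batches;
  int64_t batch_bytes = 0;
  int batch_rows = 0;
  for (int row = 0; row < num_rows; ++row) {
    if (batch_bytes + row_size > std::numeric_limits<int32_t>::max()) {
      // close the current batch at the last 32-row boundary and carry the remainder over
      int const keep = batch_rows & ~31;
      batches.push_back({static_cast<int64_t>(keep) * row_size, keep});
      batch_rows -= keep;
      batch_bytes = static_cast<int64_t>(batch_rows) * row_size;
    }
    batch_bytes += row_size;
    batch_rows++;
  }
  if (batch_rows > 0) { batches.push_back({batch_bytes, batch_rows}); }

  for (auto const &b : batches) {
    printf("batch: %d rows, %lld bytes\n", b.row_count, static_cast<long long>(b.num_bytes));
  }
  return 0;
}

With these numbers the 50,000 rows split into a 32,736-row batch and a 17,264-row batch, each well under the 2 GB offset limit.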
+
+  // Get the pointers to the input columnar data ready
+  std::vector<const int8_t *> input_data;
+  std::vector<const bitmask_type *> input_nm;
+  input_data.reserve(num_columns);
+  input_nm.reserve(num_columns);
+  for (size_type column_number = 0; column_number < num_columns; column_number++) {
+    column_view cv = tbl.column(column_number);
+    auto const col_type = cv.type();
+    bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING;
+
+    if (!nested_type) {
+      input_data.emplace_back(cv.data<int8_t>());
+      input_nm.emplace_back(cv.null_mask());
+    }
+  }
+
+  auto dev_input_data = make_device_uvector_async(input_data, stream, mr);
+  auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr);
+
+  std::vector<size_type> row_sizes;     // size of each row in bytes including any alignment padding
+  std::vector<size_type> row_offsets;   // offset from the start of the data to this row
+  std::vector<size_type> column_sizes;  // byte size of each column
+  std::vector<size_type> column_starts; // offset of column inside a row including alignment
+  std::vector<column_view>
+      variable_width_columns; // list of the variable width columns in the table
+  row_sizes.resize(num_rows); // written by index below, so it must be sized, not just reserved
+  row_offsets.reserve(num_rows);
+  column_sizes.reserve(num_columns);
+  column_starts.reserve(num_columns + 1); // we add a final offset for validity data start
+
+  auto iter =
+      thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+                                      [&tbl](auto i) -> std::tuple<data_type, column_view const> {
+                                        return std::make_tuple(tbl.column(i).type(), tbl.column(i));
+                                      });
+
+  size_type fixed_width_size_per_row =
+      detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes);
+
+  auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr);
+  auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr);
+
+  std::vector<detail::row_batch> row_batches;
+
+  uint64_t row_batch_size = 0;
+  uint64_t total_table_size = 0;
+  size_type row_batch_rows = 0;
+  uint64_t row_offset = 0;
+
+  // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then
+  // calculate the size of each row's variable-width data and validity as well.
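Before the per-row sizing loop that follows, the formula it applies can be spelled out on made-up numbers. The inline expression below stands in for cudf's num_bitmask_words, assuming 32-bit bitmask words; the fixed-width size would come from compute_column_information in the real code.

#include <cstdio>

int main() {
  // Made-up values: 26 columns whose fixed-width data occupies 112 bytes per row.
  int const num_columns = 26;
  int const fixed_width_size_per_row = 112;

  // One validity bit per column, rounded up to whole 32-bit bitmask words (4 bytes each).
  int const validity_size = ((num_columns + 31) / 32) * 4; // 4 bytes here

  int row_size = fixed_width_size_per_row + validity_size; // 116
  row_size = (row_size + 7) & ~7;                          // rows are 8-byte aligned -> 120

  printf("per-row size = %d bytes\n", row_size);
  return 0;
}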
+ auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust + for (int row = 0; row < num_rows; ++row) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += validity_size; + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned + + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + } + + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + // blast through the entire table and convert it + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 threads(256); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail::copy_validity_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), + column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), + dev_input_nm.data()); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i = 0; i < row_batches.size(); ++i) { + // compute offsets for this row batch + std::vector offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row = 0; row < row_batches[i].row_count; ++row) { + cur_offset = 
detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + offset_vals.push_back(cur_offset); + } + offset_offset += row_batches[i].row_count; + + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); + auto offsets = std::make_unique(data_type{type_id::INT32}, + (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, + std::move(output_buffers[i])); + + ret.push_back( + cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + } + + return ret; + #else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + } + + std::vector> + convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + const cudf::size_type num_columns = tbl.num_columns(); + + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. + max_rows_per_batch = (max_rows_per_batch / 32) * 32; + + cudf::size_type num_rows = tbl.num_rows(); + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid_async(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid_async(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows( + row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, + dev_input_data, dev_input_nm, *zero, *step, stream, mr)); + } + + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } + } + + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO why? + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + + size_type validity_size = num_bitmask_words(num_columns) * 4; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); + detail::copy_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), + validity_block_infos.size(), child.data()); + + return std::make_unique(std::move(output_columns)); + #else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + } + + std::unique_ptr 
convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + detail::copy_from_rows_fixed_width_optimized<<>>( + num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), + dev_output_data.data(), dev_output_nm.data(), child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } + } + + } // namespace cudf + \ No newline at end of file diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index a67589fbaec..932afa4bb70 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -50,8 +50,8 @@ #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -67,13 +67,11 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, - cudf::bitmask_type **output_nm, - const int8_t *input_data) { +__global__ void copy_from_rows_fixed_width_optimized( + const cudf::size_type num_rows, const 
cudf::size_type num_columns, + const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, + const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -190,12 +188,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } } -__global__ void -copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, - const cudf::size_type *num_bytes, const int8_t **input_data, - const cudf::bitmask_type **input_nm, int8_t *output_data) { +__global__ void copy_to_rows_fixed_width_optimized( + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, + const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -367,12 +364,11 @@ struct row_batch { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - const size_type num_block_infos, const int8_t **input_data, - const size_type *col_sizes, const size_type *col_offsets, - const block_info *block_infos, const size_type *row_offsets, - int8_t **output_data) { +__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type num_block_infos, + const int8_t **input_data, const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. 
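The two-pass structure these kernel comments describe, stripped of all row/column bookkeeping, looks like the minimal CUDA sketch below. It is illustrative only: it stages a tile in shared memory and writes it back out unchanged, whereas the real kernels reshape data between row-major and column-major layouts and overlap the passes with cuda::memcpy_async and barriers.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Pass 1 stages a tile of the input in shared memory; pass 2 writes it back out.
__global__ void two_pass_copy(const int8_t *in, int8_t *out, int tile_bytes) {
  extern __shared__ int8_t tile[];
  int const base = blockIdx.x * tile_bytes;

  // pass 1: global memory -> shared memory
  for (int i = threadIdx.x; i < tile_bytes; i += blockDim.x) { tile[i] = in[base + i]; }
  __syncthreads();

  // pass 2: shared memory -> global memory
  for (int i = threadIdx.x; i < tile_bytes; i += blockDim.x) { out[base + i] = tile[i]; }
}

int main() {
  constexpr int tile_bytes = 1024;
  constexpr int num_tiles = 8;
  constexpr int total = tile_bytes * num_tiles;

  int8_t *d_in, *d_out;
  cudaMalloc(&d_in, total);
  cudaMalloc(&d_out, total);
  cudaMemset(d_in, 7, total);

  two_pass_copy<<<num_tiles, 256, tile_bytes>>>(d_in, d_out, tile_bytes);
  cudaDeviceSynchronize();

  int8_t host[4];
  cudaMemcpy(host, d_out, sizeof(host), cudaMemcpyDeviceToHost);
  printf("%d %d %d %d\n", host[0], host[1], host[2], host[3]); // prints 7 7 7 7

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

The point of the staging pass is that both the global reads and the global writes can stay coalesced even when the layout transform between them is not.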
@@ -396,15 +392,15 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ group.sync(); auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); size_t fetch; size_t subset; for (subset = fetch = 0; subset < blocks_remaining; ++subset) { // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -462,7 +458,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; auto const block_output_buffer = output_data[block.buffer_num]; @@ -499,7 +495,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ * @param input_data pointer to input data * */ -__global__ void copy_validity_from_columns( +__global__ void copy_validity_to_rows( const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { @@ -633,74 +629,6 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num return {col_size_bytes, col_offset_bytes}; } -/** - * @brief ensure `read_ahead` buffer blocks are fetched - * - * @param fetch_index internal state passed into the function - * @param processing_index index where processing is occuring - * @param read_ahead_count how many blocks to read ahead - * @param max_resident_blocks how many blocks can be loaded at once - * @param total_blocks total number of blocks overall - * @param block_infos pointer to the block infos - * @param col_sizes pointer to column size information - * @param col_offsets pointer to the table's column offsets - * @param row_offsets pointer to offsets for each row in the table - * @param input_data pointer to the input data - * @param shared pointer to shared memory - * @param group thread group participating in the fetch - * @param block_barrier barriers used for each block - * @return - */ -static __device__ void -fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_index, - int const read_ahead_count, int const max_resident_blocks, - int const total_blocks, block_info const *const block_infos, - size_type const *const col_sizes, size_type const *const col_offsets, - size_type const *const row_offsets, int8_t const *const input_data, - int8_t *shared[], cooperative_groups::thread_block const group, - cuda::barrier *block_barrier) { - for (; fetch_index < static_cast(total_blocks) && - 
fetch_index < (processing_index + read_ahead_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main - cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, - fetch_barrier); - } - } -} - /** * @brief copy data from row-based format to cudf columns * @@ -716,7 +644,7 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde * @param input_data pointer to input data * */ -__global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, +__global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type *_col_sizes, const size_type *_col_offsets, const block_info *block_infos, @@ -746,40 +674,70 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co group.sync(); - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + + size_t fetch_index; + size_t processing_index; + for (processing_index = fetch_index = 0; processing_index < blocks_remaining; + ++processing_index) { + // Fetch ahead up to stages_count groups + for (; fetch_index < static_cast(blocks_remaining) && + fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = _col_offsets[fetch_block.start_col]; + auto const 
fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), - col_offset_size = sizeof(decltype(*_col_offsets))]( - int const num_cols, - int const num_rows) -> std::tuple { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } - return {col_size_bytes, col_offset_bytes}; - }; + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = + (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, fetch_barrier); + } + } - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - fetch_blocks_for_row_to_column(fetch, subset, stages_count, stages_count, blocks_remaining, - block_infos, _col_sizes, _col_offsets, row_offsets, input_data, - shared, group, block_barrier); + auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready - subset_barrier.arrive_and_wait(); + processing_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); - // auto shared_row_offsets = shared[subset]; - auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); + auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); auto shared_col_offsets = - reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); @@ -803,10 +761,10 @@ __global__ void copy_to_columns(const 
size_type num_rows, const size_type num_co shared_memory_row_offset + shared_row_offset; auto const column_size = shared_col_sizes[relative_col]; - int8_t *shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); + cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); } group.sync(); } @@ -831,7 +789,7 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co * @param input_data pointer to input data * */ -__global__ void copy_validity_to_columns( +__global__ void copy_validity_from_rows( const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { @@ -1050,7 +1008,7 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty int shared_size = detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - copy_from_fixed_width_columns<<>>( + copy_to_rows_fixed_width_optimized<<>>( start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), input_data.data(), input_nm.data(), data->mutable_view().data()); @@ -1354,18 +1312,6 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector row_batches; - auto calculate_variable_width_row_data_size = [](int const row) { - // each level of variable-width data will add an offset/length - // uint64 of data. The first of which is inside the fixed-width - // data itself and needs to be aligned based on what is around - // that data. This is handled above with the fixed-width calculations - // for that reason. We may still need to add more of these offset/length - // combinations if the nesting is deeper than one level as these - // will be included in the variable-width data blob at the end of the - // row. 
- return 0; - }; - uint64_t row_batch_size = 0; uint64_t total_table_size = 0; size_type row_batch_rows = 0; @@ -1382,8 +1328,7 @@ std::vector> convert_to_rows(cudf::table_view cons // validity is byte aligned row_sizes[row] += validity_size; // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8) + - calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { @@ -1426,10 +1371,10 @@ std::vector> convert_to_rows(cudf::table_view cons auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); - detail::copy_from_columns<<>>( + detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), reinterpret_cast(dev_output_data.data())); @@ -1439,9 +1384,9 @@ std::vector> convert_to_rows(cudf::table_view cons auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_from_columns<<>>( num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), @@ -1610,9 +1555,9 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_to_columns<<>>( + detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), block_infos.size(), child.data()); @@ -1645,11 +1590,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail:: - copy_validity_to_columns<<>>( + copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), child.data()); @@ -1707,7 +1652,7 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( int shared_size = 
detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - detail::copy_to_fixed_width_columns<<>>( + detail::copy_from_rows_fixed_width_optimized<<>>( num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), dev_output_nm.data(), child.data()); From c4b02424dcb4794381a27e8440ea9702d0054ed4 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 21 Oct 2021 14:49:26 -0700 Subject: [PATCH 27/80] fixed typo --- java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index 9541d05ce00..e4106574a19 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -393,7 +393,7 @@ public final void setInts(long offset, int[] data, long srcOffset, long len) { */ public final long getLong(long offset) { long requestedAddress = this.address + offset; - addressOutOfBoundsCheck(requestedAddress, 8, "setLong"); + addressOutOfBoundsCheck(requestedAddress, 8, "getLong"); return UnsafeMemoryAccessor.getLong(requestedAddress); } @@ -404,7 +404,7 @@ public final long getLong(long offset) { */ public final void setLong(long offset, long value) { long requestedAddress = this.address + offset; - addressOutOfBoundsCheck(requestedAddress, 8, "getLong"); + addressOutOfBoundsCheck(requestedAddress, 8, "setLong"); UnsafeMemoryAccessor.setLong(requestedAddress, value); } From e92989c822aa613cf7ed6310ff59b2a8bcf0e376 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 23 Oct 2021 01:21:15 +0000 Subject: [PATCH 28/80] Updating for actual PR. Fixed a few last minute bugs, removed cudf-land code that was there for testing and benchmarking. 
--- cpp/CMakeLists.txt | 1 - cpp/benchmarks/CMakeLists.txt | 4 - .../row_conversion/row_conversion.cpp | 181 -- cpp/src/row_conversion/row_conversion.cu | 1666 ----------------- cpp/tests/CMakeLists.txt | 4 - cpp/tests/row_conversion/row_conversion.cpp | 677 ------- java/src/main/native/src/row_conversion.cu | 33 +- 7 files changed, 16 insertions(+), 2550 deletions(-) delete mode 100644 cpp/benchmarks/row_conversion/row_conversion.cpp delete mode 100644 cpp/src/row_conversion/row_conversion.cu delete mode 100644 cpp/tests/row_conversion/row_conversion.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 785ac1f72de..82bc5bfba93 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -350,7 +350,6 @@ add_library(cudf src/rolling/rolling.cu src/rolling/rolling_collect_list.cu src/round/round.cu - src/row_conversion/row_conversion.cu src/scalar/scalar.cpp src/scalar/scalar_factories.cpp src/search/search.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7d353c37df7..b3b92003573 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -250,7 +250,3 @@ ConfigureBench(JSON_BENCH # - io benchmark --------------------------------------------------------------------- ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) - -################################################################################################### -# - row conversion benchmark --------------------------------------------------------- -ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp deleted file mode 100644 index fb8e4c8aef3..00000000000 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -class RowConversion : public cudf::benchmark { -}; - -static void BM_old_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto new_rows = cudf::convert_to_rows(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_old_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - cudf::lists_column_view const first_list(rows.front()->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - cudf::lists_column_view const 
first_list(rows.front()->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows(first_list, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) - -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu deleted file mode 100644 index c068a2c0b76..00000000000 --- a/cpp/src/row_conversion/row_conversion.cu +++ /dev/null @@ -1,1666 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #include - #include - #include - #include - #include - - #include - #include - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - #include - #endif - - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; - constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; - constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; - constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; - constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; - #endif - - using cudf::detail::make_device_uvector_async; - using rmm::device_uvector; - namespace cudf { - - namespace detail { - - static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); - } - - __global__ void copy_from_rows_fixed_width_optimized( - const cudf::size_type num_rows, const cudf::size_type num_columns, - const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, - const int8_t *input_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. 
- // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. 
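      // The membership ballot below is taken with the full 0xffffffff mask *before*
      // the bounds check, so every lane in the warp stays converged and agrees on
      // which lanes actually hold a row. The later validity ballot then uses this
      // active_mask, so tail lanes past num_rows contribute no bits, and the lane
      // whose row_index is a multiple of 32 can write the packed word straight into
      // the null mask.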
- uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } - } - - __global__ void copy_to_rows_fixed_width_optimized( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, - const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
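   // Concretely: threadIdx.x picks the row within the current row_group,
   // threadIdx.y strides across that row's columns, and blockIdx.x strides across
   // row_groups. Pass 1 has each (x, y) thread scatter its columns (and validity
   // bits) into the row-shaped staging area in shared memory; pass 2 streams the
   // whole row_group back out to the output buffer as aligned int64_t words.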
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. - if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t *long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } - } - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int buffer_num; - - __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); - } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } - - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } - }; - - // When building the columns to return, we have to be mindful of the offset limit in cudf. - // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. 
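   // A minimal host-side sketch of that batching rule (simplified; the names below
   // are illustrative only and not part of this file's API): rows are committed to
   // the current batch in 32-row windows, and the batch is closed at the last
   // committed window whenever adding another row would overflow a 32-bit offset.
   // Assumes <vector>, <limits> and <cstdint> are available.
   struct sketch_row_batch {
     uint64_t num_bytes;
     int row_count;
   };

   inline std::vector<sketch_row_batch>
   sketch_build_row_batches(std::vector<uint64_t> const &row_sizes) {
     std::vector<sketch_row_batch> batches;
     uint64_t batch_bytes = 0, window_bytes = 0;  // committed vs. pending bytes
     int batch_rows = 0, window_rows = 0;
     auto const limit = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
     for (auto const row_size : row_sizes) {
       if (batch_rows > 0 && batch_bytes + window_bytes + row_size > limit) {
         // cut at the last 32-row boundary; the pending window rolls into the next batch
         batches.push_back({batch_bytes, batch_rows});
         batch_bytes = 0;
         batch_rows = 0;
       }
       window_bytes += row_size;
       if (++window_rows == 32) {  // commit the pending window on every 32-row boundary
         batch_bytes += window_bytes;
         batch_rows += window_rows;
         window_bytes = 0;
         window_rows = 0;
       }
     }
     if (batch_rows + window_rows > 0) {
       batches.push_back({batch_bytes + window_bytes, batch_rows + window_rows});
     }
     return batches;
   }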
- struct row_batch { - size_type num_bytes; - size_type row_count; - }; - - /** - * @brief copy data from cudf columns into x format, which is row-based - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param input_data pointer to raw table data - * @param input_nm pointer to validity data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param row_offsets offset to a specific row in the input data - * @param output_data pointer to output data - * - */ - __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type num_block_infos, - const int8_t **input_data, const size_type *col_sizes, - const size_type *col_offsets, const block_info *block_infos, - const size_type *row_offsets, int8_t **output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - // to do the copy we need to do n column copies followed by m element copies OR - // we have to do m element copies followed by r row copies. When going from column - // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it - // more appealing to copy element-wise from input data into shared matching the end layout - // and do row-based memcopies out. 
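   // Put differently: the staging layout in shared memory already matches the
   // final row layout. Element (relative_row, relative_col) lands at
   //   relative_row * fetch_block_row_size + (col_offset - starting_column_offset),
   // so the write-out phase below can issue one aligned, row-sized memcpy_async
   // per row instead of one copy per element.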
- - auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { - auto const relative_col = el / num_fetch_rows; - auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; - auto const input_src = input_data[absolute_col] + col_size * absolute_row; - - // copy the element from global memory - switch (col_size) { - case 2: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<2>(col_size), fetch_barrier); - break; - case 4: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<4>(col_size), fetch_barrier); - break; - case 8: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<8>(col_size), fetch_barrier); - break; - default: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, - fetch_barrier); - break; - } - } - } - - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; - - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - auto const shared_offset = block_row_size * relative_row; - - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), subset_barrier); - } - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data pointer to output data, partitioned by data size - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ - __global__ void copy_validity_to_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { - extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - // per conversation with DaveB - // each 
thread of warp reads a single int32 of validity - so we read 128 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. - // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); - } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); - auto const total_sections = num_sections_x * num_sections_y; - - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); - - if (absolute_col < num_columns) { - auto my_data = input_nm[absolute_col] != nullptr ? - input_nm[absolute_col][absolute_row / 32] : - std::numeric_limits::max(); - - // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to - // make the bytes we actually write. 
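           // The loop below is effectively a 32x32 bit transpose done with warp
           // ballots: lane l holds the 32-row validity word of column
           // (block.start_col + section_x * 32 + l), and iteration i gathers bit i
           // (row relative_row + i) from every participating lane into one word
           // whose bit l is that column's validity. The lead lane then stores the
           // word row-major into shared memory, trimmed to the columns remaining.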
- bitmask_type dw_mask = 1; - for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { - if (cols_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; - - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - cuda::memcpy_async( - output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } - } - - static __device__ std::tuple - get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ - __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. 
- // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. - - // to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); - - size_t fetch_index; - size_t processing_index; - for (processing_index = fetch_index = 0; processing_index < blocks_remaining; - ++processing_index) { - // Fetch ahead up to stages_count groups - for (; fetch_index < static_cast(blocks_remaining) && - fetch_index < (processing_index + stages_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = _col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = - (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main - cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, fetch_barrier); - } - } - - auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // ensure our data is ready - processing_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - 
sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); - auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); - - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = block_row_size * relative_row; - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - auto const column_size = shared_col_sizes[relative_col]; - - int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); - } - group.sync(); - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ - __global__ void copy_validity_from_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { - extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - // per conversation with DaveB - // each thread of warp reads a single byte of validity - so we read 32 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. 
- // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { - shared_block_barriers[validity_index].arrive_and_wait(); - } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; - auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; - - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; - - // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make - // the bytes we actually write. 
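           // This is the mirror image of the to-rows kernel: lane l holds one
           // validity byte of row (block.start_row + section_y * 32 + l), and
           // iteration i ballots bit i of that byte across the 32 lanes, producing
           // a 32-row word for column (relative_col + i) that the lead lane writes
           // column-major into shared memory, trimmed to the rows remaining.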
- for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; - ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / 8; - - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - - cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? - NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : - blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } - } - - #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - /** - * Calculate the dimensions of the kernel for fixed width only columns. - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ - static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, dim3 &blocks, - dim3 &threads) { - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. 
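   // Net effect of the constraints above, given the 48KB default shared-memory
   // budget: y = min(ceil(num_columns / 4), 32) threads cover the columns, x is
   // min(1024 / y, 48KB / size_per_row) rounded down to a multiple of 32, and each
   // block stages x full rows, i.e. size_per_row * x bytes of shared memory.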
- int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) { - y_block_size = 32; - } - int x_possible_block_size = 1024 / y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size / size_per_row; - // If we don't have enough shared memory there is no point in having more threads - // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; - // Make sure that the x dimension is a multiple of 32 this not only helps - // coalesce memory access it also lets us do a ballot sync for validity to write - // the data back out the warp level. If x is a multiple of 32 then each thread in the y - // dimension is associated with one or more warps, that should correspond to the validity - // words directly. - int block_size = (max_block_size / 32) * 32; - CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. - num_blocks = 10240; - } - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; - threads.x = block_size; - threads.y = y_block_size; - threads.z = 1; - return size_per_row * block_size; - } - - /** - * When converting to rows it is possible that the size of the table was too big to fit - * in a single column. This creates an output column for a subset of the rows in a table - * going from start row and containing the next num_rows. Most of the parameters passed - * into this function are common between runs and should be calculated once. 
- */ - static std::unique_ptr - fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - - std::unique_ptr data = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_to_rows_fixed_width_optimized<<>>( - start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), - input_data.data(), input_nm.data(), data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); - } - - static cudf::data_type get_data_type(const cudf::column_view &v) { - return v.type(); - } - - static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), - [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); - } - - /** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ - static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. - int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); - // validity comes at the end and is byte aligned so we can pack more in. 
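// As a concrete illustration with an assumed schema of {INT8, INT64, INT32}:
//   column_start = {0, 8, 16} and column_size = {1, 8, 4} after alignment
//   validity_bytes_needed = (3 + 7) / 8 = 1 byte, landing at offset 20
//   padded row size       = align_offset(21, 8) = 24 bytes per row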
- at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) - } - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - template - static size_type compute_column_information(iterator begin, iterator end, - std::vector &column_starts, - std::vector &column_sizes) //, - // std::function nested_type_cb) - { - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = fixed_width_size_per_row; - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; - } - - std::vector - build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_block, - std::vector const &row_batches) { - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(), - 8); - // we fit as much as we can given the column stride - // note that an element in the table takes just 1 bit, but a row with a single - // element still takes 8 bytes! 
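// To make the numbers concrete (an assumed 48 KiB shared-memory budget and 300 columns,
// purely for illustration):
//   desired_rows_and_columns = sqrt(49152)              ~= 221
//   column_stride            = round_down(221, 8)        = 216 columns per window
//   bytes_per_row            = align(ceil(216 / 8), 8)    = 32 validity bytes per row
//   row_stride               = min(num_rows, 49152 / 32)  = 1536 rows per window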
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); - - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int row = 0; - while (row < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(row_stride, rows_left_in_batch); - - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; - } - } - - return validity_block_infos; - } - - std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) { - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, current_window_start_row, end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges - // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = - // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The - // trick is that it's in bytes, not rows or columns. - size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, row_batches[0].row_count); - - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. 
- return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); - } - - return block_infos; - } - - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - } // namespace detail - - std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO: why? - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. 
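The cutting rule itself is small enough to sketch on its own. The helper below is only an illustration of the batching logic described above, with assumed names and simplified bookkeeping (the real code also tracks per-row offsets and 8-byte alignment while it scans):

#include <cstddef>
#include <cstdint>
#include <vector>

struct batch_sketch {
  uint64_t num_bytes;
  int row_count;
};

// Group rows into batches that stay under byte_limit, cutting each finished batch back
// to a whole multiple of 32 rows so a 32-bit validity word never straddles two batches.
std::vector<batch_sketch> sketch_row_batches(std::vector<uint64_t> const &row_sizes,
                                             uint64_t byte_limit) {
  std::vector<batch_sketch> batches;
  std::vector<uint64_t> open;  // sizes of the rows in the batch being built
  uint64_t open_bytes = 0;
  for (auto size : row_sizes) {
    if (!open.empty() && open_bytes + size > byte_limit) {
      std::size_t keep = open.size() - (open.size() % 32);
      if (keep == 0) { keep = open.size(); }  // degenerate case: individual rows are huge
      uint64_t kept_bytes = 0;
      for (std::size_t i = 0; i < keep; ++i) { kept_bytes += open[i]; }
      batches.push_back({kept_bytes, static_cast<int>(keep)});
      // rows past the 32-row boundary carry over into the next batch
      open.erase(open.begin(), open.begin() + keep);
      open_bytes -= kept_bytes;
    }
    open.push_back(size);
    open_bytes += size;
  }
  if (!open.empty()) { batches.push_back({open_bytes, static_cast<int>(open.size())}); }
  return batches;
}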
- - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; - - uint64_t row_batch_size = 0; - uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; - - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. 
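// For example, with assumed values of 48 bytes of aligned fixed-width data and 9 columns:
//   validity_size = 1 bitmask word           = 4 bytes
//   row size      = align_offset(48 + 4, 8)  = 56 bytes, before any variable-width data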
- auto validity_size = num_bitmask_words(num_columns) * 4; - // thrust - for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - dim3 threads(256); - - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), - column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), - dev_input_nm.data()); - - // split up the output buffer into multiple buffers based on row batch sizes - // and create list of byte columns - int offset_offset = 0; - std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = 
detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique(data_type{type_id::INT32}, - (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_buffers[i])); - - ret.push_back( - cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); - } - - return ret; - #else - CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); - return {}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - } - - std::vector> - convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - const cudf::size_type num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. - max_rows_per_batch = (max_rows_per_batch / 32) * 32; - - cudf::size_type num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows( - row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, - dev_input_data, dev_input_nm, *zero, *step, stream, mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } - } - - std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO why? - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), - block_infos.size(), child.data()); - - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail:: - copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), - validity_block_infos.size(), child.data()); - - return std::make_unique(std::move(output_columns)); - #else - CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); - return {}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - } - - std::unique_ptr 
convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), - dev_output_data.data(), dev_output_nm.data(), child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } - } - - } // namespace cudf - \ No newline at end of file diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2da28425c9e..088b0b747fb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -330,10 +330,6 @@ ConfigureTest(RESHAPE_TEST reshape/interleave_columns_tests.cpp reshape/tile_tests.cpp) -################################################################################################### -# - row conversion test ---------------------------------------------------------------------------------- -ConfigureTest(ROW_CONVERSION_TEST row_conversion/row_conversion.cpp) - ################################################################################################### # - traits test ----------------------------------------------------------------------------------- ConfigureTest(TRAITS_TEST types/traits_test.cpp) diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp deleted file mode 100644 index b807b5cec81..00000000000 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ /dev/null @@ -1,677 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -struct ColumnToRowTests : public cudf::test::BaseFixture { -}; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; - -TEST_F(ColumnToRowTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Wide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = 
cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 1 million rows - constexpr auto 
num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 2 million rows - constexpr auto num_rows = 2 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - 
std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Wide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypes) -{ - std::vector> cols; - std::vector views; - std::vector schema{cudf::data_type{cudf::type_id::INT64}, - cudf::data_type{cudf::type_id::FLOAT64}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::BOOL8}, - cudf::data_type{cudf::type_id::FLOAT32}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::INT64}}; - - cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_point_column_wrapper c6( - {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); - cudf::test::fixed_point_column_wrapper c7( - {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); - - cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypesLarge) -{ - std::vector cols; - std::vector schema{}; - - // 10 columns of each type with 1024 entries - constexpr int num_rows{1024}; - - std::default_random_engine re; - std::uniform_real_distribution rand_double(std::numeric_limits::min(), - std::numeric_limits::max()); - std::uniform_int_distribution rand_int64(std::numeric_limits::min(), - std::numeric_limits::max()); - auto r = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> int64_t { return rand_int64(re); }); - auto d = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> double { return rand_double(re); }); - - auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); - auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); - auto most_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); - auto few_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT16}); - } - - for (int i = 0; i < 10; ++i) { - if (i < 5) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - } else { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) - .release() - .release()); - } - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, all_valid, numeric::scale_type{-2}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + 
num_rows, most_valid, numeric::scale_type{-1}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); - } - - std::vector views(cols.begin(), cols.end()); - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> 
int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 5 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 932afa4bb70..f5936e86bcd 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -50,7 +50,7 @@ #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; @@ -409,7 +409,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + if (fetch >= NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } @@ -525,7 +525,7 @@ __global__ void copy_validity_to_rows( group.sync(); for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] .arrive_and_wait(); } @@ -645,10 +645,10 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num * */ __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. 
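In isolation, that two-pass shape looks roughly like the sketch below; this is only an illustration with assumed names, since the real kernel stages whole block_infos with cuda::memcpy_async and barriers and remaps the column layout as it copies:

#include <cstdint>

__global__ void staged_copy(int8_t const *input, int8_t *output, int num_chunks,
                            int chunk_bytes) {
  extern __shared__ int8_t staging[];
  for (int chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
    long long const base = static_cast<long long>(chunk) * chunk_bytes;
    // pass 1: copy a chunk of data from global memory into shared memory
    for (int i = threadIdx.x; i < chunk_bytes; i += blockDim.x) {
      staging[i] = input[base + i];
    }
    __syncthreads();
    // pass 2: copy that chunk from shared memory out to its final location
    for (int i = threadIdx.x; i < chunk_bytes; i += blockDim.x) {
      output[base + i] = staging[i];
    }
    __syncthreads();  // make the staging buffer safe to reuse on the next iteration
  }
}

A launch of this sketch would pass the chunk size as the dynamic shared-memory argument, e.g. staged_copy<<<grid, block, chunk_bytes>>>(in, out, num_chunks, chunk_bytes).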
@@ -819,8 +819,8 @@ __global__ void copy_validity_from_rows( group.sync(); for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { + if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; shared_block_barriers[validity_index].arrive_and_wait(); } int8_t *this_shared_block = shared_blocks[validity_block % 2]; @@ -1251,7 +1251,7 @@ std::vector> convert_to_rows(cudf::table_view cons // TODO: why? total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available @@ -1368,7 +1368,7 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream); // blast through the entire table and convert it dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); @@ -1382,12 +1382,11 @@ std::vector> convert_to_rows(cudf::table_view cons auto validity_block_infos = build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_to_rows<<>>( + detail::copy_validity_to_rows<<>>( num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), dev_input_nm.data()); @@ -1508,7 +1507,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // TODO why? 
total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1590,7 +1589,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail:: From 131ca58fdfe2dfe7d0298d83a33d8e17ee41c34d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 23 Oct 2021 01:37:52 +0000 Subject: [PATCH 29/80] removing unused header, suppressing shared warning for barrier, updating java bindings to use the correct namespace --- cpp/include/cudf/row_conversion.hpp | 51 ---------------------- java/src/main/native/src/TableJni.cpp | 9 ++-- java/src/main/native/src/row_conversion.cu | 6 ++- 3 files changed, 9 insertions(+), 57 deletions(-) delete mode 100644 cpp/include/cudf/row_conversion.hpp diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp deleted file mode 100644 index 5d799f4c596..00000000000 --- a/cpp/include/cudf/row_conversion.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#include -#include -#include - -namespace cudf { - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::vector> convert_to_rows( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace cudf diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d0e6b895a1e..28a12c36b4e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2701,7 +2700,7 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); std::vector> cols = - cudf::convert_to_rows_fixed_width_optimized(*n_input_table); + cudf::java::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2719,7 +2718,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::convert_to_rows(*n_input_table); + std::vector> cols = cudf::java::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2746,7 +2745,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } std::unique_ptr result = - cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); + cudf::java::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); @@ -2769,7 +2768,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); + std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index f5936e86bcd..af26e4c0b0d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -34,7 +34,6 @@ #include #include 
#include -#include #include #include #include @@ -49,12 +48,17 @@ #include #include +#include "row_conversion.hpp" + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; + +// needed to suppress warning about cuda::barrier +#pragma diag_suppress static_var_with_dynamic_init #endif using cudf::detail::make_device_uvector_async; From d013e8b2e1182c29aac2783f3999fd86aa9087b8 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 30 Oct 2021 01:00:38 +0000 Subject: [PATCH 30/80] updating code to build block infos with thrust on the gpu --- java/src/main/native/src/row_conversion.cu | 670 +++++++++++++-------- 1 file changed, 418 insertions(+), 252 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index af26e4c0b0d..87ab1ed49d8 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,6 +21,8 @@ #include #include +#include +#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -47,8 +50,7 @@ #include #include #include - -#include "row_conversion.hpp" +#include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; @@ -64,7 +66,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; using cudf::detail::make_device_uvector_async; using rmm::device_uvector; namespace cudf { - +namespace java { namespace detail { static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { @@ -324,6 +326,11 @@ __global__ void copy_to_rows_fixed_width_optimized( #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +/** + * @brief The GPU blocks work on one or more block_info structs of data. + * This structure defined the workspace for the block. + * + */ struct block_info { int start_col; int start_row; @@ -340,38 +347,36 @@ struct block_info { __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } }; -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. +/** + * @brief Returning rows is done in a byte cudf column. This is limited in size by + * `size_type` and so output is broken into batches of rows that fit inside + * this limit. 
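+ * As a worked example, at roughly 100 bytes per encoded row a single batch can
+ * hold about 21 million rows (2^31 / 100) before a new batch must be started.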
+ * + */ struct row_batch { size_type num_bytes; size_type row_count; + device_uvector row_offsets; }; /** - * @brief copy data from cudf columns into x format, which is row-based + * @brief copy data from cudf columns into JCUDF format, which is row-based * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block shared memory amount each `block_info` is using + * @param block_infos span of `block_info` structs the define the work * @param input_data pointer to raw table data - * @param input_nm pointer to validity data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data * */ __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type num_block_infos, - const int8_t **input_data, const size_type *col_sizes, - const size_type *col_offsets, const block_info *block_infos, + const size_type shmem_used_per_block, + device_span block_infos, const int8_t **input_data, + const size_type *col_sizes, const size_type *col_offsets, const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -396,7 +401,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum group.sync(); auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); size_t fetch; @@ -491,23 +496,25 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets + * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array * @param input_data pointer to input data * */ -__global__ void copy_validity_to_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { +__global__ void copy_validity_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, + const size_type validity_offset, + device_span block_infos, + const bitmask_type **input_nm) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - // per conversation with DaveB + using cudf::detail::warp_size; + // each thread of warp reads a single int32 of validity - so we read 128 bytes // then ballot_sync the bits and write the result to shmem // after we fill 
shared mem memcpy it out in a blob. @@ -515,7 +522,7 @@ __global__ void copy_validity_to_rows( auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); __shared__ cuda::barrier @@ -545,9 +552,9 @@ __global__ void copy_validity_to_rows( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -567,7 +574,7 @@ __global__ void copy_validity_to_rows( input_nm[absolute_col][absolute_row / 32] : std::numeric_limits::max(); - // every thread that is participating in the warp has a byte, but it's column-based + // every thread that is participating in the warp has 4 bytes, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. bitmask_type dw_mask = 1; @@ -576,7 +583,7 @@ __global__ void copy_validity_to_rows( // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -625,6 +632,14 @@ __global__ void copy_validity_to_rows( } } +/** + * @brief Admin data is data stored in shared memory that isn't actual column data + * + * @param col_size_size size of the column size data. + * @param col_offset_size size of the column offset data. + * @param num_cols number of columns in the block. + * @return tuple of the size of column and offset admin data. 
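+ * For example, with 10 columns and 4-byte size and offset entries, a block
+ * needs 40 bytes for column sizes and 40 bytes for column offsets.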
+ */ static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { auto const col_size_bytes = num_cols * col_size_size; @@ -639,9 +654,8 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm + * @param row_offsets offset to a specific row in the input data + * @param output_data pointers to column data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start * @param block_infos information about the blocks of work @@ -651,8 +665,9 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { + const size_type *_col_offsets, + device_span block_infos, + const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -678,8 +693,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col group.sync(); - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + auto blocks_remaining = + std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); size_t fetch_index; size_t processing_index; @@ -785,23 +801,24 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm + * @param row_offsets offset to a specific row in the input data + * @param output_nm pointers to null masks for columns * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array * @param input_data pointer to input data * */ -__global__ void copy_validity_from_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { +__global__ void +copy_validity_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + cudf::bitmask_type **output_nm, const size_type validity_offset, + device_span block_infos, const int8_t *input_data) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - // per conversation with DaveB + using cudf::detail::warp_size; 
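+  // Note: each lane of the warp holds the validity byte for one row, so
+  // __ballot_sync(participation_mask, my_byte & byte_mask) gathers bit i
+  // (column relative_col + i) from all 32 rows into a single 32-bit word,
+  // which ends up as one bitmask word of that column's null mask.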
+ // each thread of warp reads a single byte of validity - so we read 32 bytes // then ballot_sync the bits and write the result to shmem // after we fill shared mem memcpy it out in a blob. @@ -809,7 +826,7 @@ __global__ void copy_validity_from_rows( auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); __shared__ cuda::barrier @@ -837,14 +854,14 @@ __global__ void copy_validity_from_rows( auto const num_sections_y = (num_block_rows + 31) / 32; auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols + // convert section to row and col auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 8; @@ -860,13 +877,13 @@ __global__ void copy_validity_from_rows( input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make + // data and we need it in column-based. So we shuffle the bits around to make // the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; @@ -898,10 +915,10 @@ __global__ void copy_validity_from_rows( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; + auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], + starting_address, &this_shared_block[validity_data_col_length * relative_col], util::div_rounding_up_unsafe(num_block_rows, 8), shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); } @@ -919,7 +936,8 @@ __global__ void copy_validity_from_rows( #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 /** - * Calculate the dimensions of the kernel for fixed width only columns. + * @brief Calculate the dimensions of the kernel for fixed width only columns. + * * @param [in] num_columns the number of columns being copied. * @param [in] num_rows the number of rows being copied. * @param [in] size_per_row the size each row takes up when padded. 
@@ -995,7 +1013,7 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - int64_t total_allocation = size_per_row * num_rows; + int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); @@ -1020,17 +1038,14 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } -static cudf::data_type get_data_type(const cudf::column_view &v) { - return v.type(); -} - static inline bool are_all_fixed_width(std::vector const &schema) { return std::all_of(schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); } /** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @brief Given a set of fixed width columns, calculate how the data will be laid out in memory. + * * @param [in] schema the types of columns that need to be laid out. * @param [out] column_start the byte offset where each column starts in the row. * @param [out] column_size the size in bytes of the data for each columns in the row. @@ -1065,19 +1080,25 @@ static inline int32_t compute_fixed_width_layout(std::vector co #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +/** + * @brief Compute information about a table such as bytes per row and offsets. + * + * @tparam iterator iterator of column schema data + * @param begin starting iterator of column schema + * @param end ending iterator of column schema + * @param column_starts column start offsets + * @param column_sizes size in bytes of each column + * @return size of the fixed_width data portion of a row. + */ template static size_type compute_column_information(iterator begin, iterator end, std::vector &column_starts, - std::vector &column_sizes) //, -// std::function nested_type_cb) -{ + std::vector &column_sizes) { size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); @@ -1096,6 +1117,15 @@ static size_type compute_column_information(iterator begin, iterator end, return fixed_width_size_per_row; } +/** + * @brief Build `block_info` for the validity data to break up the work. 
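+ * The validity kernels work on these blocks in 8-column by 32-row sections
+ * (one section per warp iteration), so column strides are kept to multiples
+ * of 8 and row strides to multiples of 32 where possible.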
+ * + * @param num_columns number of columns in the table + * @param num_rows number of rows in the table + * @param shmem_limit_per_block size of shared memory available to a single gpu block + * @param row_batches batched row information for multiple output locations + * @return vector of `block_info` structs for validity data + */ std::vector build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, size_type const &shmem_limit_per_block, @@ -1139,43 +1169,202 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro return validity_block_infos; } -std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) { - std::vector block_infos; +constexpr size_type max_batch_size = std::numeric_limits::max(); + +/** + * @brief Holds information about the batches of data to be processed + * + */ +struct batch_data { + std::vector batch_row_boundaries; + device_uvector input_data_row_offsets; + std::vector row_batches; + + batch_data(size_type num_input_offsets, rmm::cuda_stream_view stream) + : input_data_row_offsets(num_input_offsets, stream){}; +}; +/** + * @brief Builds batches of rows that will fit in the size limit of a column. + * + * @tparam RowSize iterator that gives the size of a specific row of the table. + * @param num_rows Total number of rows in the table + * @param row_sizes iterator that gives the size of a specific row of the table. + * @param stream stream to operate on for this work + * @param mr memory resource used to allocate any returned data + * @returns vector of size_type's that indicate row numbers for batch boundaries and a + * device_uvector of row offsets + */ + +template +batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); + auto const num_batches = util::div_rounding_up_safe(total_size, max_batch_size); + auto const num_offsets = num_batches + 1; + batch_data ret(num_rows + 1, stream); + + // at most max gpu memory / 2GB iterations. + ret.batch_row_boundaries.reserve(num_offsets); + ret.batch_row_boundaries.push_back(0); + size_type last_row_end = 0; + device_uvector cumulative_row_sizes(num_rows, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, + cumulative_row_sizes.begin()); + while ((int)ret.batch_row_boundaries.size() < num_offsets) { + // find the next max_batch_size boundary + size_type const row_end = + ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), + cumulative_row_sizes.begin() + (num_rows - last_row_end), + max_batch_size) - + cumulative_row_sizes.begin()) + + last_row_end); + + // build offset list for each row in this batch + auto const num_entries = row_end - last_row_end + 1; + device_uvector output_batch_row_offsets(num_entries, stream, mr); + + auto row_size_iter_bounded = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [row_end, row_sizes, last_row_end] __device__(auto i) { + return i >= row_end ? 
0 : row_sizes[i + last_row_end]; + }); + + thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, + row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); + + ret.batch_row_boundaries.push_back(row_end); + auto const batch_bytes = output_batch_row_offsets.element(row_end, stream) - + output_batch_row_offsets.element(last_row_end, stream); + auto const num_rows_in_batch = row_end - last_row_end; + ret.row_batches.push_back( + {batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + last_row_end = row_end; + } + + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, [row_sizes, num_rows] __device__(auto i) { return (i < num_rows) ? row_sizes[i] : 0; }); + thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + num_rows + 1, + ret.input_data_row_offsets.begin()); + + return ret; +} + +/** + * @brief Computes the number of blocks necessary given a window height and batch offsets + * + * @param batch_row_offsets row offsets for each batch + * @param desired_window_height height of each window in the table + * @param stream stream to use + * @return number of windows necessary + */ +int compute_block_counts(device_span const &batch_row_offsets, + int desired_window_height, rmm::cuda_stream_view stream) { + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - + batch_row_offsets[batch_index], + desired_window_height); + }); + return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); +} + +/** + * @brief Builds the `block_info` structs for a given table. + * + * @param blocks span of blocks to populate + * @param batch_row_offsets offsets to row batches + * @param column_start starting column of the window + * @param column_end ending column of the window + * @param desired_window_height height of the window + * @param total_number_of_rows total number of rows in the table + * @param stream stream to use + * @return number of windows created + */ +size_type +build_blocks(device_span blocks, + device_uvector const &batch_row_offsets, // comes from build_batches + int column_start, int column_end, int desired_window_height, int total_number_of_rows, + rmm::cuda_stream_view stream) { + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - + batch_row_offsets[batch_index], + desired_window_height); + }); + + size_type const total_blocks = + thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + + device_uvector block_starts(num_batches + 1, stream); + auto block_iter = cudf::detail::make_counting_transform_iterator( + 0, [num_blocks = num_blocks.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? 
num_blocks[i] : 0; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), block_iter, block_iter + num_batches + 1, + block_starts.begin()); // in blocks + + thrust::transform( + rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), + [=, block_starts = block_starts.data(), + batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + // what batch this block falls in + auto const batch_index_iter = + thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); + auto const batch_index = std::distance(block_starts, batch_index_iter) - 1; + // local index within the block + int const local_block_index = block_index - block_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_offsets[batch_index]; + // the start row for this block + int const block_row_start = batch_row_start + (local_block_index * desired_window_height); + // the end row for this block + int const max_row = std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches ? + std::numeric_limits::max() : + static_cast(batch_row_offsets[batch_index + 1]) - 1); + int const block_row_end = std::min( + batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); + + // stuff the block + return block_info{column_start, block_row_start, column_end, block_row_end, + static_cast(batch_index)}; + }); + + return total_blocks; +} + +/** + * @brief Determines what data should be operated on by each block for the incoming table. + * + * @tparam WindowCallback Callback that receives the start and end columns of windows + * @param column_sizes vector of the size of each column + * @param column_starts vector of the offset of each column + * @param first_row_batch_size size of the first row batch to limit max window size since a window + * is unable to span batches + * @param total_number_of_rows total number of rows in the table + * @param shmem_limit_per_block shared memory allowed per block + * @param f callback function called when building a window + */ +template +void determine_windows(std::vector const &column_sizes, + std::vector const &column_starts, + size_type const first_row_batch_size, size_type const total_number_of_rows, + size_type const &shmem_limit_per_block, WindowCallback f) { // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access int current_window_width = 0; int current_window_start_col = 0; - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, current_window_start_row, end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - // the ideal window height has lots of 8-byte reads and 8-byte writes. 
The optimal read/write // would be memory cache line sized access, but since other blocks will read/write the edges // this may not turn out to be overly important. For now, we will attempt to build a square @@ -1183,12 +1372,10 @@ std::vector build_block_infos(std::vector const &column_s // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, row_batches[0].row_count); + int const window_height = + std::clamp(util::round_up_safe( + std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), + 1, first_row_batch_size); auto calc_admin_data_size = [](int num_cols) -> size_type { // admin data is the column sizes and column start information. @@ -1213,7 +1400,8 @@ std::vector build_block_infos(std::vector const &column_s calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); + f(current_window_start_col, col == 0 ? col : col - 1, window_height); + row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match @@ -1228,12 +1416,24 @@ std::vector build_block_infos(std::vector const &column_s // build last set of blocks if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + f(current_window_start_col, (int)column_sizes.size() - 1, window_height); } - - return block_infos; } +struct row_size_functor { + size_type _fixed_width_size_per_row; + size_type _num_columns; + row_size_functor(size_t fixed_width_size_per_row, size_t num_columns) + : _fixed_width_size_per_row(fixed_width_size_per_row), _num_columns(num_columns){}; + + CUDA_DEVICE_CALLABLE + int operator()(int row_index) { + auto const bytes_needed = + _fixed_width_size_per_row + util::div_rounding_up_safe(_num_columns, 8); + return detail::align_offset(bytes_needed, 8); + } +}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } // namespace detail @@ -1242,9 +1442,6 @@ std::vector> convert_to_rows(cudf::table_view cons rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -1253,7 +1450,7 @@ std::vector> convert_to_rows(cudf::table_view cons int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: why? + // TODO: why is this needed. kernel fails to launch if all memory is requested. 
total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; @@ -1277,150 +1474,113 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } + std::transform( + tbl.begin(), tbl.end(), std::back_inserter(input_data), + [](cudf::column_view const &c) -> int8_t const * { return c.template data(); }); + std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_nm), + [](auto c) { return c.null_mask(); }); auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - auto iter = + auto schema_column_iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { return std::make_tuple(tbl.column(i).type(), tbl.column(i)); }); - size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + size_type fixed_width_size_per_row = detail::compute_column_information( + schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - std::vector row_batches; - - uint64_t row_batch_size = 0; - uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + // total encoded row size. This includes fixed-width data, validity, and variable-width data. + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, detail::row_size_functor(fixed_width_size_per_row, num_columns)); // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. 
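  // As a worked example of row_size_functor: a table with 40 bytes of fixed-width
  // data and 9 columns needs 40 + ceil(9 / 8) = 40 + 2 = 42 bytes per row, which
  // align_offset rounds up to 48 so that every row starts on an 8-byte boundary.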
auto validity_size = num_bitmask_words(num_columns) * 4; - // thrust - for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); + auto gpu_batch_row_boundaries = + make_device_uvector_async(batch_info.batch_row_boundaries, stream); + + // the first batch always exists unless we were sent an empty table + auto const first_batch_size = batch_info.row_batches[0].row_count; std::vector output_buffers; std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.reserve(batch_info.row_batches.size()); + for (uint i = 0; i < batch_info.row_batches.size(); ++i) { + rmm::device_buffer temp(batch_info.row_batches[i].num_bytes, stream, mr); output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream); + int info_count = 0; + detail::determine_windows( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, + int const window_height) { + int i = detail::compute_block_counts(gpu_batch_row_boundaries, window_height, stream); + info_count += i; + }); + + // allocate space for blocks + device_uvector gpu_block_infos(info_count, stream); + int block_offset = 0; + + detail::determine_windows( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, + stream](int const start_col, int const end_col, int const window_height) { + block_offset += detail::build_blocks( + {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, + gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + }); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), 
NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), batch_info.input_data_row_offsets.data(), reinterpret_cast(dev_output_data.data())); - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + auto validity_block_infos = detail::build_validity_block_infos( + num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), - column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), - dev_input_nm.data()); + num_rows, num_columns, shmem_limit_per_block, batch_info.input_data_row_offsets.data(), + dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns - int offset_offset = 0; std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique(data_type{type_id::INT32}, - (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_buffers[i])); - - ret.push_back( - cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + for (int batch = 0; batch < (int)batch_info.row_batches.size(); ++batch) { + auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); + auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release()); + auto data = + std::make_unique(data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch])); + + ret.push_back(cudf::make_lists_column( + batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); } return ret; @@ -1437,7 +1597,8 @@ convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_str std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + 
std::transform(tbl.begin(), tbl.end(), schema.begin(), + [](auto i) -> cudf::data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { std::vector column_start; @@ -1509,7 +1670,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO why? + // TODO: why is this needed. kernel fails to launch if all memory is requested. total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; @@ -1519,8 +1680,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); size_type validity_size = num_bitmask_words(num_columns) * 4; @@ -1534,8 +1695,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // build the row_batches from the passed in list column std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); + row_batches.push_back( + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1553,45 +1714,48 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_output_data = make_device_uvector_async(output_data, stream, mr); auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + // only ever get a single batch when going from rows, so boundaries + // are 0, num_rows + device_uvector gpu_batch_row_boundaries(2, stream); + + thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(2), gpu_batch_row_boundaries.begin(), + [num_rows] __device__(auto i) { return i == 0 ? 
0 : num_rows; }); + + int info_count = 0; + detail::determine_windows(column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const window_height) { + info_count += detail::compute_block_counts(gpu_batch_row_boundaries, + window_height, stream); + }); + + // allocate space for blocks + device_uvector gpu_block_infos(info_count, stream); + + int block_offset = 0; + detail::determine_windows( + column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, + stream](int const start_col, int const end_col, int const window_height) { + block_offset += detail::build_blocks( + {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, + gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + }); + + dim3 blocks( + util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), - block_infos.size(), child.data()); + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + child.data()); + + auto validity_block_infos = + detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); @@ -1599,8 +1763,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in detail:: copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), - validity_block_infos.size(), child.data()); + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos, + child.data()); return std::make_unique(std::move(output_columns)); #else @@ -1665,4 +1829,6 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( } } +} // namespace java + } // namespace cudf From 
70e39cd58b26c5576140f9c95fbee13edeffff19 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 4 Nov 2021 00:02:24 +0000 Subject: [PATCH 31/80] fixing overflow issues with large tables --- java/src/main/native/src/row_conversion.cu | 202 +++++++++++---------- 1 file changed, 110 insertions(+), 92 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 87ab1ed49d8..c5bbed5274c 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include #include @@ -25,6 +27,8 @@ #include #include +#include "thrust/scan.h" + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 #include #endif @@ -50,7 +54,6 @@ #include #include #include -#include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; @@ -336,7 +339,7 @@ struct block_info { int start_row; int end_col; int end_row; - int buffer_num; + int batch_number; __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, size_type const *const col_sizes) const { @@ -369,7 +372,7 @@ struct row_batch { * @param input_data pointer to raw table data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param row_offsets offset to a specific row in the input data + * @param row_offsets offset to a specific row in the output data * @param output_data pointer to output data * */ @@ -470,7 +473,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; + auto const block_output_buffer = output_data[block.batch_number]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; @@ -496,7 +499,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets offset to a specific row in the input data + * @param row_offsets offset to a specific row in the output data * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work @@ -610,7 +613,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type group.sync(); auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; + output_data[block.batch_number] + validity_offset + block.start_col / 8; // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { @@ -1176,12 +1179,18 @@ constexpr size_type max_batch_size = std::numeric_limits::max(); * */ struct batch_data { + device_uvector batch_row_offsets; std::vector batch_row_boundaries; - device_uvector input_data_row_offsets; std::vector row_batches; +}; - batch_data(size_type num_input_offsets, rmm::cuda_stream_view stream) - : 
input_data_row_offsets(num_input_offsets, stream){}; +template struct row_size_functor { + RowSize _row_sizes; + size_type _num_rows; + row_size_functor(RowSize row_sizes) : _row_sizes(row_sizes){}; + + CUDA_DEVICE_CALLABLE + uint64_t operator()(int row_index) { return static_cast(_row_sizes[row_index]); } }; /** @@ -1199,19 +1208,26 @@ struct batch_data { template batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); - auto const num_batches = util::div_rounding_up_safe(total_size, max_batch_size); + auto uint64_row_sizes = + cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); + auto const total_size = + thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); + auto const num_batches = static_cast( + util::div_rounding_up_safe(total_size, static_cast(max_batch_size))); auto const num_offsets = num_batches + 1; - batch_data ret(num_rows + 1, stream); + std::vector row_batches; + std::vector batch_row_boundaries; + device_uvector batch_row_offsets(num_rows, stream); // at most max gpu memory / 2GB iterations. - ret.batch_row_boundaries.reserve(num_offsets); - ret.batch_row_boundaries.push_back(0); + batch_row_boundaries.reserve(num_offsets); + batch_row_boundaries.push_back(0); size_type last_row_end = 0; - device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, + device_uvector cumulative_row_sizes(num_rows, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows, cumulative_row_sizes.begin()); - while ((int)ret.batch_row_boundaries.size() < num_offsets) { + + while ((int)batch_row_boundaries.size() < num_offsets) { // find the next max_batch_size boundary size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), @@ -1220,6 +1236,9 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream cumulative_row_sizes.begin()) + last_row_end); + // build offset list for each row in this batch + auto const num_rows_in_batch = row_end - last_row_end; + // build offset list for each row in this batch auto const num_entries = row_end - last_row_end + 1; device_uvector output_batch_row_offsets(num_entries, stream, mr); @@ -1232,44 +1251,44 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); - ret.batch_row_boundaries.push_back(row_end); - auto const batch_bytes = output_batch_row_offsets.element(row_end, stream) - - output_batch_row_offsets.element(last_row_end, stream); - auto const num_rows_in_batch = row_end - last_row_end; - ret.row_batches.push_back( - {batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); + + // The output_batch_row_offsets vector is used as the offset column of the returned data. This + // needs to be individually allocated, but the kernel needs a contiguous array of offsets or + // more global lookups are necessary. 
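+    // Copy this batch's offsets into the contiguous batch_row_offsets array.
+    // Note (editorial): this cudaMemcpy executes on the default stream;
+    // cudaMemcpyAsync on `stream` could avoid serializing against it.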
+ cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + + batch_row_boundaries.push_back(row_end); + row_batches.push_back({batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + last_row_end = row_end; } - auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, [row_sizes, num_rows] __device__(auto i) { return (i < num_rows) ? row_sizes[i] : 0; }); - thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + num_rows + 1, - ret.input_data_row_offsets.begin()); - - return ret; + return {std::move(batch_row_offsets), batch_row_boundaries, std::move(row_batches)}; } /** * @brief Computes the number of blocks necessary given a window height and batch offsets * - * @param batch_row_offsets row offsets for each batch + * @param batch_row_boundaries row boundaries for each batch * @param desired_window_height height of each window in the table * @param stream stream to use * @return number of windows necessary */ -int compute_block_counts(device_span const &batch_row_offsets, +int compute_block_counts(device_span const &batch_row_boundaries, int desired_window_height, rmm::cuda_stream_view stream) { - size_type const num_batches = batch_row_offsets.size() - 1; + size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_blocks(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - - batch_row_offsets[batch_index], - desired_window_height); - }); + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_boundaries = + batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - + batch_row_boundaries[batch_index], + desired_window_height); + }); return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); } @@ -1277,7 +1296,7 @@ int compute_block_counts(device_span const &batch_row_offsets, * @brief Builds the `block_info` structs for a given table. 
* * @param blocks span of blocks to populate - * @param batch_row_offsets offsets to row batches + * @param batch_row_boundaries boundary to row batches * @param column_start starting column of the window * @param column_end ending column of the window * @param desired_window_height height of the window @@ -1287,20 +1306,20 @@ int compute_block_counts(device_span const &batch_row_offsets, */ size_type build_blocks(device_span blocks, - device_uvector const &batch_row_offsets, // comes from build_batches + device_uvector const &batch_row_boundaries, // comes from build_batches int column_start, int column_end, int desired_window_height, int total_number_of_rows, rmm::cuda_stream_view stream) { - size_type const num_batches = batch_row_offsets.size() - 1; + size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_blocks(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - - batch_row_offsets[batch_index], - desired_window_height); - }); + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_boundaries = + batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - + batch_row_boundaries[batch_index], + desired_window_height); + }); size_type const total_blocks = thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); @@ -1316,7 +1335,7 @@ build_blocks(device_span blocks, thrust::transform( rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), [=, block_starts = block_starts.data(), - batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type block_index) { // what batch this block falls in auto const batch_index_iter = thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); @@ -1324,14 +1343,15 @@ build_blocks(device_span blocks, // local index within the block int const local_block_index = block_index - block_starts[batch_index]; // the start row for this batch. - int const batch_row_start = batch_row_offsets[batch_index]; + int const batch_row_start = batch_row_boundaries[batch_index]; // the start row for this block int const block_row_start = batch_row_start + (local_block_index * desired_window_height); // the end row for this block - int const max_row = std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches ? - std::numeric_limits::max() : - static_cast(batch_row_offsets[batch_index + 1]) - 1); + int const max_row = + std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches ? 
+ std::numeric_limits::max() : + static_cast(batch_row_boundaries[batch_index + 1]) - 1); int const block_row_end = std::min( batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); @@ -1420,20 +1440,6 @@ void determine_windows(std::vector const &column_sizes, } } -struct row_size_functor { - size_type _fixed_width_size_per_row; - size_type _num_columns; - row_size_functor(size_t fixed_width_size_per_row, size_t num_columns) - : _fixed_width_size_per_row(fixed_width_size_per_row), _num_columns(num_columns){}; - - CUDA_DEVICE_CALLABLE - int operator()(int row_index) { - auto const bytes_needed = - _fixed_width_size_per_row + util::div_rounding_up_safe(_num_columns, 8); - return detail::align_offset(bytes_needed, 8); - } -}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } // namespace detail @@ -1502,7 +1508,11 @@ std::vector> convert_to_rows(cudf::table_view cons // total encoded row size. This includes fixed-width data, validity, and variable-width data. auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(fixed_width_size_per_row, num_columns)); + 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { + auto const bytes_needed = + fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); + return detail::align_offset(bytes_needed, 8); + }); // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. @@ -1518,11 +1528,14 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector output_buffers; std::vector output_data; output_data.reserve(batch_info.row_batches.size()); - for (uint i = 0; i < batch_info.row_batches.size(); ++i) { - rmm::device_buffer temp(batch_info.row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } + output_buffers.reserve(batch_info.row_batches.size()); + std::transform(batch_info.row_batches.begin(), batch_info.row_batches.end(), + std::back_inserter(output_buffers), [&](auto const &batch) { + return rmm::device_buffer(batch.num_bytes, stream, mr); + }); + std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), + [](auto &buf) { return static_cast(buf.data()); }); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; @@ -1551,11 +1564,6 @@ std::vector> convert_to_rows(cudf::table_view cons dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), batch_info.input_data_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - auto validity_block_infos = detail::build_validity_block_infos( num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); @@ -1563,8 +1571,16 @@ std::vector> convert_to_rows(cudf::table_view cons dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), + batch_info.batch_row_offsets + .data(), // needs to be row offsets per 
batch, not overall JUST for output. + reinterpret_cast(dev_output_data.data())); + detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, batch_info.input_data_row_offsets.data(), + num_rows, num_columns, shmem_limit_per_block, batch_info.batch_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes @@ -1693,11 +1709,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - // build the row_batches from the passed in list column - std::vector row_batches; - row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); - // Allocate the columns we are going to write into std::vector> output_columns; std::vector output_data; @@ -1711,6 +1722,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } + // build the row_batches from the passed in list column + std::vector row_batches; + row_batches.push_back( + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); @@ -1746,10 +1762,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in dim3 blocks( util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, - child.data()); auto validity_block_infos = detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); @@ -1760,6 +1772,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + + detail::copy_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + child.data()); + detail:: copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), From 64c8374aa4e21cd164a5011be3cc20d7ec377ac1 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 9 Nov 2021 03:50:24 +0000 Subject: [PATCH 32/80] fixing includes for java --- java/src/main/native/src/row_conversion.cu | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c5bbed5274c..f9cb61f4ea1 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -23,16 +23,24 @@ #include #include -#include -#include #include -#include "thrust/scan.h" - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 #include #endif +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include #include #include #include @@ -40,20 +48,14 @@ #include #include #include -#include #include 
#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include + +#include "row_conversion.hpp" #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; From f8ea2b1f767f1ce2885b71086c8936a1b13319a5 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 15 Nov 2021 13:33:35 -0800 Subject: [PATCH 33/80] addressed review concerns --- java/src/main/java/ai/rapids/cudf/Table.java | 26 ++++++---- .../test/java/ai/rapids/cudf/TableTest.java | 48 ++++++++----------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index eb61ec25d9a..7d9e5a19ed6 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -2651,6 +2651,23 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, return buildSemiJoinGatherMap(gatherMapData); } + /** + * For details about how this method functions refer to + * {@link #convertToRowsFixedWidthOptimized()}. + * + * The only thing different between this method and {@link #convertToRowsFixedWidthOptimized()} + * is that this can handle rougly 250M columns while {@link #convertToRowsFixedWidthOptimized()} + * can only handle columns less than 100 + */ + public ColumnVector[] convertToRows() { + long[] ptrs = convertToRows(nativeHandle); + ColumnVector[] ret = new ColumnVector[ptrs.length]; + for (int i = 0; i < ptrs.length; i++) { + ret[i] = new ColumnVector(ptrs[i]); + } + return ret; + } + /** * Convert this table of columns into a row major format that is useful for interacting with other * systems that do row major processing of the data. Currently only fixed-width column types are @@ -2725,15 +2742,6 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, * There are some limits on the size of a single row. If the row is larger than 1KB this will * throw an exception. 
*/ - public ColumnVector[] convertToRows() { - long[] ptrs = convertToRows(nativeHandle); - ColumnVector[] ret = new ColumnVector[ptrs.length]; - for (int i = 0; i < ptrs.length; i++) { - ret[i] = new ColumnVector(ptrs[i]); - } - return ret; - } - public ColumnVector[] convertToRowsFixedWidthOptimized() { long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); ColumnVector[] ret = new ColumnVector[ptrs.length]; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 623b444676f..6cc108030d1 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7223,26 +7223,22 @@ void fixedWidthRowsRoundTripWide() { IntStream.range(0, 10).forEach(i -> tb.decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null)); IntStream.range(0, 10).forEach(i -> tb.decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null)); - try (Table t = tb.build()) { - ColumnVector[] rows = t.convertToRows(); + try (Table origTable = tb.build()) { + ColumnVector[] rowMajorTable = origTable.convertToRows(); try { // We didn't overflow - assert rows.length == 1; - ColumnVector cv = rows[0]; - assert cv.getRowCount() == t.getRowCount(); -// try (HostColumnVector hcv = cv.copyToHost()) { -// hcv.getChildColumnView(0).getDataBuffer().printBuffer(8); -// } - - DType[] types = new DType[t.getNumberOfColumns()]; - for (int i = 0; i < t.getNumberOfColumns(); i++) { - types[i] = t.getColumn(i).getType(); + assert rowMajorTable.length == 1; + ColumnVector cv = rowMajorTable[0]; + assert cv.getRowCount() == origTable.getRowCount(); + DType[] types = new DType[origTable.getNumberOfColumns()]; + for (int i = 0; i < origTable.getNumberOfColumns(); i++) { + types[i] = origTable.getColumn(i).getType(); } try (Table backAgain = Table.convertFromRows(cv, types)) { - assertTablesAreEqual(t, backAgain); + assertTablesAreEqual(origTable, backAgain); } } finally { - for (ColumnVector cv : rows) { + for (ColumnVector cv : rowMajorTable) { cv.close(); } } @@ -7251,7 +7247,7 @@ void fixedWidthRowsRoundTripWide() { @Test void fixedWidthRowsRoundTrip() { - try (Table t = new TestBuilder() + try (Table origTable = new TestBuilder() .column(3l, 9l, 4l, 2l, 20l, null) .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) .column(5, 1, 0, 2, 7, null) @@ -7261,25 +7257,21 @@ void fixedWidthRowsRoundTrip() { .decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) .decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null) .build()) { - ColumnVector[] rows = t.convertToRowsFixedWidthOptimized(); + ColumnVector[] rowMajorTable = origTable.convertToRowsFixedWidthOptimized(); try { // We didn't overflow - assert rows.length == 1; - ColumnVector cv = rows[0]; - assert cv.getRowCount() == t.getRowCount(); -// try (HostColumnVector hcv = cv.copyToHost()) { -// hcv.getChildColumnView(0).getDataBuffer().printBuffer(8); -// } - - DType[] types = new DType[t.getNumberOfColumns()]; - for (int i = 0; i < t.getNumberOfColumns(); i++) { - types[i] = t.getColumn(i).getType(); + assert rowMajorTable.length == 1; + ColumnVector cv = rowMajorTable[0]; + assert cv.getRowCount() == origTable.getRowCount(); + DType[] types = new DType[origTable.getNumberOfColumns()]; + for (int i = 0; i < origTable.getNumberOfColumns(); i++) { + types[i] = origTable.getColumn(i).getType(); } try (Table backAgain = Table.convertFromRowsFixedWidthOptimized(cv, types)) { - assertTablesAreEqual(t, backAgain); + assertTablesAreEqual(origTable, backAgain); 
} } finally { - for (ColumnVector cv : rows) { + for (ColumnVector cv : rowMajorTable) { cv.close(); } } From c88472a95869619664c28f02dc321428523d87f9 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 15 Nov 2021 14:33:09 -0800 Subject: [PATCH 34/80] removed TODOs and added note to javadocs --- java/src/main/java/ai/rapids/cudf/Table.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 7d9e5a19ed6..b39632e43e7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -2754,13 +2754,14 @@ public ColumnVector[] convertToRowsFixedWidthOptimized() { /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. + * + * NOTE: This method doesn't support nested types + * * @param vec the row data to process. * @param schema the types of each column. * @return the parsed table. */ public static Table convertFromRows(ColumnView vec, DType ... schema) { - // TODO at some point we need a schema that support nesting so we can support nested types - // TODO we will need scale at some point very soon too int[] types = new int[schema.length]; int[] scale = new int[schema.length]; for (int i = 0; i < schema.length; i++) { @@ -2774,13 +2775,14 @@ public static Table convertFromRows(ColumnView vec, DType ... schema) { /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. + * + * NOTE: This method doesn't support nested types + * * @param vec the row data to process. * @param schema the types of each column. * @return the parsed table. */ public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) { - // TODO at some point we need a schema that support nesting so we can support nested types - // TODO we will need scale at some point very soon too int[] types = new int[schema.length]; int[] scale = new int[schema.length]; for (int i = 0; i < schema.length; i++) { From 00e58d7912e56b790b6e448b827b1cd481ab6500 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 7 Jun 2021 08:14:52 +0000 Subject: [PATCH 35/80] working on row and column conversions --- .../row_conversion/row_conversion.cpp | 116 ++ cpp/include/cudf/row_conversion.hpp | 51 + cpp/src/row_conversion/row_conversion.cu | 1106 +++++++++++++++++ 3 files changed, 1273 insertions(+) create mode 100644 cpp/benchmarks/row_conversion/row_conversion.cpp create mode 100644 cpp/include/cudf/row_conversion.hpp create mode 100644 cpp/src/row_conversion/row_conversion.cu diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp new file mode 100644 index 00000000000..c4edee91b3c --- /dev/null +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include "cudf_test/column_utilities.hpp" + +class RowConversion : public cudf::benchmark { +}; + +static void BM_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 50, + row_count{n_rows}); + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + +// auto rows = cudf::convert_to_rows(table->view()); + auto new_rows = cudf::convert_to_rows2(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 4, + row_count{n_rows});*/ + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_to_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 16, 1 << 24}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) + +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_from_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 22}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp new file mode 100644 index 00000000000..f5e2225ad19 --- /dev/null +++ b/cpp/include/cudf/row_conversion.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cudf { + +std::vector> convert_to_rows( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::vector> convert_to_rows2( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu new file mode 100644 index 00000000000..fb5dc4cb38d --- /dev/null +++ b/cpp/src/row_conversion/row_conversion.cu @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "cudf/types.hpp" +#include "rmm/device_buffer.hpp" +#include "thrust/iterator/counting_iterator.h" +#include "thrust/iterator/transform_iterator.h" + +namespace cudf { + +namespace detail { + +static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) +{ + return (offset + alignment - 1) & ~(alignment - 1); +} + + +/** + * Copy a simple vector to device memory asynchronously. Be sure to read + * the data on the same stream as is used to copy it. 
+ */ +template +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync( + ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +template +rmm::device_uvector copy_to_dev_async2( + const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + rmm::device_uvector ret(input.size(), stream, mr); + CUDA_TRY(cudaMemcpyAsync( + ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + 
shared_index]; + } + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } +} + +__global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, + const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, + const cudf::size_type *num_bytes, + const int8_t **input_data, + const cudf::bitmask_type **input_nm, + int8_t *output_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
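As a side note, a small host-side sketch of the thread mapping the comment above describes (hypothetical toy launch, illustrative only): threadIdx.x selects the row within a row group, threadIdx.y strides across the columns, and blockIdx.x strides across the row groups.

#include <cstdio>

int main() {
  // hypothetical toy launch: 2 blocks of 4x2 threads over 10 rows and 5 columns
  int const num_rows = 10, num_columns = 5;
  int const block_dim_x = 4, block_dim_y = 2, grid_dim_x = 2;
  int const rows_per_group = block_dim_x;
  int const num_row_groups = (num_rows + rows_per_group - 1) / rows_per_group;

  for (int bx = 0; bx < grid_dim_x; ++bx)                          // blockIdx.x
    for (int ty = 0; ty < block_dim_y; ++ty)                       // threadIdx.y
      for (int tx = 0; tx < block_dim_x; ++tx)                     // threadIdx.x
        for (int g = bx; g < num_row_groups; g += grid_dim_x) {    // blocks stride across row groups
          int const row = g * rows_per_group + tx;                 // x dimension -> row
          if (row >= num_rows) continue;
          for (int col = ty; col < num_columns; col += block_dim_y)  // y dimension -> columns
            std::printf("block %d thread (%d,%d) handles row %d col %d\n", bx, tx, ty, row, col);
        }
  return 0;
}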
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. + if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; + } + __syncthreads(); + // Go for the next round + } +} + +struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; +}; + +/** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ +__global__ void copy_from_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const int8_t **input_data, + const cudf::bitmask_type **input_nm, + const cudf::size_type *col_sizes, + const cudf::size_type *col_offsets, + const block_info *block_infos, + const uint64_t *row_offsets, + int8_t **output_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. 
+ // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + auto block = block_infos[blockIdx.x]; + extern __shared__ int8_t shared_data[]; + uint64_t const output_start_offset = col_offsets[block.start_col] + row_offsets[block.start_row]; + uint8_t const dest_shim_offset = reinterpret_cast(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + + printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); + + // each thread is responsible for every threadcount rows of data. + // the data is copies into shared memory in the final layout. + auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const validity_offset = col_offsets[num_columns]; + for (int col=block.start_col; col<=block.end_col; ++col) { + /*if (!col_is_variable) */{ + uint64_t col_offset = 0; + cudf::size_type col_size = col_sizes[col]; + auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + for (int row=block.start_row + threadIdx.x; row(input_data[col]); + *reinterpret_cast(shmem_dest) = short_col_input[row]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(input_data[col]); + *reinterpret_cast(shmem_dest) = int_col_input[row]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + *reinterpret_cast(shmem_dest) = long_col_input[row]; + break; + } + default: { + cudf::size_type input_offset = col_size * row; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + shmem_dest[b] = input_data[col][b + input_offset]; + } + break; + } + } + + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + // we do this directly in the final location because the entire row may not + // fit in shared memory and may require many blocks to process it entirely + int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + cudf::size_type byte_bit_offset = col % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col]) { + if (bit_is_set(input_nm[col], row)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end row + + col_offset += col_sizes[col] * (block.end_row - block.start_row); + } + } // end col + + // wait for the data to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data from shared memory to final destination + // each block is potentially a slice of the table, so no assumptions + // can be made about alignments. We do know that the alignment in shared + // memory matches the final destination alignment. 
Also note that + // we are not writing to entirely contiguous destinations as each + // row in shared memory may not be an entire row of the destination. + // + auto const thread_start_offset = threadIdx.x * 8; + auto const thread_stride = gridDim.x * 8; + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const output_row_num = src_offset / shmem_row_size; + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + // the first part and last part of the row is unaligned data copy. This is copied a single byte + // at a time. + if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first part of a row, copy single bytes + auto const num_single_bytes = 8 - dest_shim_offset; + for (auto i=0; i 0 && (src_offset + 8) % shmem_row_size == 0) { + // last part of a row, copy single bytes + auto const num_single_bytes = dest_shim_offset; + for (auto i=0; i(input_ptr); + *reinterpret_cast(output_ptr) = *long_col_input; + } + } +} + +/** + * Calculate the dimensions of the kernel for fixed width only columns. + * @param [in] num_columns the number of columns being copied. + * @param [in] num_rows the number of rows being copied. + * @param [in] size_per_row the size each row takes up when padded. + * @param [out] blocks the size of the blocks for the kernel + * @param [out] threads the size of the threads for the kernel + * @return the size in bytes of shared memory needed for each block. + */ +static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, + const cudf::size_type num_rows, + const cudf::size_type size_per_row, + dim3 &blocks, + dim3 &threads) +{ + // We have found speed degrades when a thread handles more than 4 columns. + // Each block is 2 dimensional. The y dimension indicates the columns. + // We limit this to 32 threads in the y dimension so we can still + // have at least 32 threads in the x dimension (1 warp) which should + // result in better coalescing of memory operations. We also + // want to guarantee that we are processing a multiple of 32 threads + // in the x dimension because we use atomic operations at the block + // level when writing validity data out to main memory, and that would + // need to change if we split a word of validity data between blocks. + int y_block_size = (num_columns + 3) / 4; + if (y_block_size > 32) { y_block_size = 32; } + int x_possible_block_size = 1024 / y_block_size; + // 48KB is the default setting for shared memory per block according to the cuda tutorials + // If someone configures the GPU to only have 16 KB this might not work. + int max_shared_size = 48 * 1024; + int max_block_size = max_shared_size / size_per_row; + // If we don't have enough shared memory there is no point in having more threads + // per block that will just sit idle + max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; + // Make sure that the x dimension is a multiple of 32 this not only helps + // coalesce memory access it also lets us do a ballot sync for validity to write + // the data back out the warp level. If x is a multiple of 32 then each thread in the y + // dimension is associated with one or more warps, that should correspond to the validity + // words directly. 
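To make the sizing above concrete, a host-side recomputation of these kernel dimensions for a hypothetical table (illustrative only, not part of the patch):

#include <algorithm>
#include <cstdio>

int main() {
  int const num_columns  = 10;        // hypothetical
  int const num_rows     = 1'000'000; // hypothetical
  int const size_per_row = 48;        // hypothetical padded row size in bytes

  int const y_block_size = std::min((num_columns + 3) / 4, 32);  // <= 4 columns per thread, capped at 32
  int const x_possible   = 1024 / y_block_size;                  // stay within 1024 threads per block
  int const max_block    = (48 * 1024) / size_per_row;           // rows whose staging fits in 48KB of shared memory
  int const block_size   = (std::min(max_block, x_possible) / 32) * 32;  // multiple of 32 for the validity ballot
  int const num_blocks   = std::min(std::max((num_rows + block_size - 1) / block_size, 1), 10240);

  // prints: threads = (320, 3), blocks = 3125, shared memory = 15360 bytes per block
  std::printf("threads = (%d, %d), blocks = %d, shared memory = %d bytes per block\n",
              block_size, y_block_size, num_blocks, size_per_row * block_size);
  return 0;
}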
+ int block_size = (max_block_size / 32) * 32; + CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); + + int num_blocks = (num_rows + block_size - 1) / block_size; + if (num_blocks < 1) { + num_blocks = 1; + } else if (num_blocks > 10240) { + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. + num_blocks = 10240; + } + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; + threads.x = block_size; + threads.y = y_block_size; + threads.z = 1; + return size_per_row * block_size; +} + +/** + * When converting to rows it is possible that the size of the table was too big to fit + * in a single column. This creates an output column for a subset of the rows in a table + * going from start row and containing the next num_rows. Most of the parameters passed + * into this function are common between runs and should be calculated once. + */ +static std::unique_ptr fixed_width_convert_to_rows( + const cudf::size_type start_row, + const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type size_per_row, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, + const cudf::scalar &zero, + const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + + std::unique_ptr data = + cudf::make_numeric_column(cudf::data_type(cudf::type_id::INT8), + static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, + stream, + mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_from_fixed_width_columns<<>>( + start_row, + num_rows, + num_columns, + size_per_row, + column_start->data(), + column_size->data(), + input_data->data(), + input_nm->data(), + data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr); +} + +static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } + +static inline bool are_all_fixed_width(std::vector const &schema) +{ + return std::all_of( + schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); +} + +/** + * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @param [in] schema the types of columns that need to be laid out. + * @param [out] column_start the byte offset where each column starts in the row. + * @param [out] column_size the size in bytes of the data for each columns in the row. + * @return the size in bytes each row needs. 
+ */ +static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) +{ + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. + int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add it + // in + int32_t validity_bytes_needed = (schema.size() + 7) / 8; + // validity comes at the end and is byte aligned so we can pack more in. + at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) +} + +} // namespace detail + +//#define DEBUG +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough + // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + constexpr int max_window_height = 1024; + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + #if defined(DEBUG) + auto pretty_print = [](uint64_t i) { + if (i > (1 * 1024 * 1024 * 1024)) { + printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); + } else if (i > (1 * 1024 * 1024)) { + printf("%.2f MB", i / float(1 * 1024 * 1024)); + } else if (i > (1 * 1024)) { + printf("%.2f KB", float(i / 1024)); + } else { + printf("%lu Bytes", i); + } + }; + #endif + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. 
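A minimal sketch of the column-window cut described above, under hypothetical column sizes and a hypothetical shared-memory budget (illustrative only): columns are accumulated, with per-type alignment, until a window of window_height rows would no longer fit in shared memory, at which point the window is closed and a new one starts.

#include <cstdio>
#include <vector>

static int align_offset(int offset, int alignment) { return (offset + alignment - 1) & ~(alignment - 1); }

int main() {
  int const shmem_limit_per_block = 16 * 1024;  // hypothetical budget
  int const window_height         = 1024;       // rows per window
  std::vector<int> const column_sizes{1, 4, 2, 8, 4, 1, 2, 1, 8};  // bytes per element, hypothetical

  int row_size = 0;
  int window_start_col = 0;
  for (int col = 0; col < (int)column_sizes.size(); ++col) {
    int const size_with_col = align_offset(row_size, column_sizes[col]) + column_sizes[col];
    if (size_with_col * window_height > shmem_limit_per_block) {
      std::printf("window: columns [%d, %d]\n", window_start_col, col - 1);
      window_start_col = col;
      row_size = column_sizes[col];  // restart (the patch also re-aligns to the output row here)
    } else {
      row_size = size_with_col;
    }
  }
  std::printf("window: columns [%d, %d]\n", window_start_col, (int)column_sizes.size() - 1);
  return 0;
}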
+ + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns+1); // we add a final offset for validity data start + + size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv);} + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. + // Note that this boundary is for our book-keeping with column pointers and not anything + // that the kernel needs to worry about. We cut the output at convienient boundaries + // when assembling the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + std::vector row_batches; + + auto calculate_variable_width_row_data_size = [](int const row) { + // each level of variable-width data will add an offset/length + // uint64 of data. The first of which is inside the fixed-width + // data itself and needs to be aligned based on what is around + // that data. This is handled above with the fixed-width calculations + // for that reason. We may still need to add more of these offset/length + // combinations if the nesting is deeper than one level as these + // will be included in the variable-width data blob at the end of the + // row. + return 0; +/* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } +*/ + }; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate + // the size of each row's variable-width data as well. 
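Two quick worked examples of the arithmetic the loop below relies on, the 8-byte row alignment and the snap back to a 32-row boundary when a batch is cut (illustrative only, not part of the patch):

#include <cstdio>

static int align_offset(int offset, int alignment) { return (offset + alignment - 1) & ~(alignment - 1); }

int main() {
  // rows are 8-byte aligned: round each offset up to the next multiple of 8
  std::printf("align_offset(13, 8) = %d\n", align_offset(13, 8));  // 16
  std::printf("align_offset(16, 8) = %d\n", align_offset(16, 8));  // 16, already aligned

  // when the 2GB limit is hit mid-batch, cut at the previous 32-row boundary so a
  // 32-bit validity word is never split across two batches
  int const row_batch_rows = 1000;             // hypothetical rows accumulated so far
  std::printf("cut batch at %d rows, carry %d rows into the next batch\n",
              row_batch_rows & ~31,            // 992
              row_batch_rows & 31);            // 8
  return 0;
}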
+ for (int row = 0; row < num_rows; ++row) { + row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); + if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size += row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + } + + #if defined(DEBUG) + printf("%lu batches:\n", row_batches.size()); + for (auto i = 0; i < (int)row_batches.size(); ++i) { + printf("%d: %d rows, ", i, row_batches[i].row_count); + pretty_print(row_batches[i].num_bytes); + printf("\n"); + } + #endif + + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_size = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back( + detail::block_info{start_col, + current_window_start_row, + start_col + end_col, + std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (int col = 0; col < num_columns; ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + + if (row_size_with_this_col * window_height > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + current_window_start_col = col; + } else { + row_size = row_size_with_this_col; + } + } + + auto validity_offset = detail::align_offset(column_starts.back(), 4); + column_starts.push_back(validity_offset); + + // build last set of blocks + if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + + // Get the pointers 
to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + std::vector input_data; + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + #if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); + pretty_print(shmem_limit_per_block); + printf(" shared mem("); + pretty_print(fixed_width_size_per_row); + printf("/row, %d columns, %d rows, ", num_columns, num_rows); + pretty_print(total_table_size); + printf(" total):\n"); + #endif + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i=0; i>>(num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i=0; i offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = + std::make_unique(data_type{cudf::type_id::INT8}, + row_batches[i].num_bytes, + std::move(output_data[i])); + + ret.push_back(cudf::make_lists_column(row_batches[i].row_count, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); + } + + return ret; +} + +std::vector> convert_to_rows(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + const cudf::size_type num_columns = tbl.num_columns(); + + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. 
+ max_rows_per_batch = (max_rows_per_batch / 32) * 32; + + cudf::size_type num_rows = tbl.num_rows(); + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, + row_count, + num_columns, + size_per_row, + dev_column_start, + dev_column_size, + dev_input_data, + dev_input_nm, + *zero, + *step, + stream, + mr)); + } + + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} + +std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + detail::copy_to_fixed_width_columns<<>>( + num_rows, + num_columns, + size_per_row, + dev_column_start->data(), + dev_column_size->data(), + dev_output_data->data(), + dev_output_nm->data(), + child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} + +std::unique_ptr convert_from_rows( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows(lcv, schema, stream, mr); + + return ret; + // } +} + +} // namespace cudf From b9f42cd2701b8933aae7156a34c9bd3ad83b1f05 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Jun 2021 17:53:09 +0000 Subject: [PATCH 36/80] fixing kernel launch and updating --- .../row_conversion/row_conversion.cpp | 9 +- cpp/src/row_conversion/row_conversion.cu | 105 +++++++++++++----- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index c4edee91b3c..9fa05c408e5 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -28,7 +28,7 @@ class RowConversion : public cudf::benchmark { static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, +/* auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -38,7 +38,10 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT8, cudf::type_id::UINT64}, 50, - row_count{n_rows}); + row_count{n_rows});*/ + auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows}); cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -98,7 +101,7 @@ static void BM_from_row(benchmark::State& state) (::benchmark::State & st) { BM_to_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 16, 1 << 24}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu 
index fb5dc4cb38d..994233a0700 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -347,14 +348,14 @@ struct block_info { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, +__global__ void copy_from_columns(const size_type num_rows, + const size_type num_columns, const int8_t **input_data, - const cudf::bitmask_type **input_nm, - const cudf::size_type *col_sizes, - const cudf::size_type *col_offsets, + const bitmask_type **input_nm, + const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, - const uint64_t *row_offsets, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -365,47 +366,92 @@ __global__ void copy_from_columns(const cudf::size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. + bool debug_print = false; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest - - printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); - + if (debug_print) { + printf("outputting to offset %lu\n", output_start_offset); + printf("dest shim offset is %d\n", dest_shim_offset); + printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; + if (debug_print) { + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("shmem row size %d\n", shmem_row_size); + printf("validity offset is %d\n", validity_offset); + printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + } for (int col=block.start_col; col<=block.end_col; ++col) { /*if (!col_is_variable) */{ uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + if (debug_print) { + printf("dest col offset %d\n", dest_col_offset); + } for (int row=block.start_row + threadIdx.x; row(input_data[col]); + if (debug_print) { + printf("%p <- short %d\n", shmem_dest, short_col_input[row]); + } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { const int32_t *int_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); + } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - // TODO this should just not be supported for fixed width columns, but just in case... + if (debug_print) { + printf("byte for byte copy due to size %d\n", col_size); + printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -676,6 +722,12 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + #if defined(DEBUG) + size_t free, total; + cudaMemGetInfo( &free, &total ); + printf("%lu/%lu Memory", free, total); + #endif + // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -692,7 +744,7 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. 
std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment std::vector variable_width_columns; // list of the variable width columns in the table @@ -821,7 +873,7 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, current_window_start_row, - start_col + end_col, + end_col, std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); i += window_height; @@ -889,23 +941,20 @@ std::vector> convert_to_rows2(cudf::table_view con auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - std::vector output_data; + std::vector output_buffers; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i=0; i(temp.data())); + output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it - dim3 blocks; - dim3 threads; - blocks.x = block_infos.size(); - blocks.y = 0; - blocks.z = 0; - threads.x = 1024; - threads.y = 0; - threads.z = 0; - detail::copy_from_columns<<>>(num_rows, + dim3 blocks(block_infos.size()); + dim3 threads(1024); + copy_from_columns<<>>(num_rows, num_columns, dev_input_data.data(), dev_input_nm.data(), @@ -932,14 +981,14 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_data[i])); + std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), From 6a267abad1dc539217f63fc41fa24b1788504955 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Jun 2021 19:25:57 +0000 Subject: [PATCH 37/80] Updates and bug fixing --- .../row_conversion/row_conversion.cpp | 76 ++- cpp/src/row_conversion/row_conversion.cu | 498 ++++++++++++------ cpp/tests/row_conversion/row_conversion.cpp | 110 ++++ 3 files changed, 488 insertions(+), 196 deletions(-) create mode 100644 cpp/tests/row_conversion/row_conversion.cpp diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 9fa05c408e5..e1228c9df21 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,10 +25,43 @@ class RowConversion : public cudf::benchmark { }; -static void BM_to_row(benchmark::State& state) +static void BM_old_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; -/* auto const table = create_random_table({cudf::type_id::INT8, + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + 
cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto rows = cudf::convert_to_rows(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_new_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -37,11 +70,11 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 50, - row_count{n_rows});*/ - auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows}); + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,14 +85,13 @@ static void BM_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); -// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_from_row(benchmark::State& state) +/*static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -73,9 +105,6 @@ static void BM_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 4, - row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -94,18 +123,19 @@ static void BM_from_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_to_row(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +}*/ + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ @@ -116,4 +146,4 @@ TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 
994233a0700..92ba075c316 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -44,7 +44,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } - /** * Copy a simple vector to device memory asynchronously. Be sure to read * the data on the same stream as is used to copy it. @@ -61,10 +60,9 @@ std::unique_ptr> copy_to_dev_async(const std::vector & } template -rmm::device_uvector copy_to_dev_async2( - const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +rmm::device_uvector copy_to_dev_async2(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_uvector ret(input.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( @@ -346,7 +344,7 @@ struct block_info { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, @@ -366,92 +364,119 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; - + bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("Column Info:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + uint8_t const dest_shim_offset = + reinterpret_cast(&output_data[0][output_start_offset]) & + 7; // offset for alignment shim in order to match shared memory with final dest if (debug_print) { printf("outputting to offset %lu\n", output_start_offset); printf("dest shim offset is %d\n", dest_shim_offset); printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + printf("my block is %d,%d -> %d,%d - buffer %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row, + block.buffer_num); } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
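// A simplified sketch of the alignment-shim accounting used just below (the
// address and row width here are hypothetical; the kernel does the same
// bookkeeping per window): the staged row is shifted by dest_shim_offset so
// that, apart from a short head and tail copied byte-by-byte, the writeback to
// the output buffer happens as aligned 8-byte words.
// (Rows that fit entirely within one 8-byte slot are handled as a separate case.)
#include <cstdint>
#include <cstdio>

int main()
{
  uint64_t const dest_address      = 0x1003;  // hypothetical output row address, not 8-byte aligned
  int const      real_bytes_in_row = 21;      // hypothetical packed width of this window's columns

  int const dest_shim_offset = dest_address & 7;                                 // 3
  int const shmem_row_size   = (real_bytes_in_row + dest_shim_offset + 7) & ~7;  // 24
  int const head_bytes       = 8 - dest_shim_offset;                             // copied singly
  int const tail_bytes       = (real_bytes_in_row + dest_shim_offset) % 8;       // copied singly
  int const full_words       = (shmem_row_size - 8 - (tail_bytes ? 8 : 0)) / 8;

  printf("shim %d: %d head byte(s), %d aligned 8-byte word(s), %d tail byte(s)\n",
         dest_shim_offset, head_bytes, full_words, tail_bytes);
  return 0;
}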
- auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const real_bytes_in_row = + col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; + auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, + 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", + block.end_col, + col_offsets[block.end_col], + block.end_col, + col_sizes[block.end_col], + block.start_col, + col_offsets[block.start_col]); printf("shmem row size %d\n", shmem_row_size); printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + printf("starting at %d,%d and going to %d, %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row); } - for (int col=block.start_col; col<=block.end_col; ++col) { - /*if (!col_is_variable) */{ - uint64_t col_offset = 0; + for (int col = block.start_col; col <= block.end_col; ++col) { + /*if (!col_is_variable) */ { + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { - printf("dest col offset %d\n", dest_col_offset); - } - for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { - printf("%p <- short %d\n", shmem_dest, short_col_input[row]); - } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); + const int32_t *int_col_input = reinterpret_cast(input_data[col]); if (debug_print) { - printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); - } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; if (debug_print) { - printf("byte for byte copy due to size %d\n", col_size); - printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... + printf("byte for byte copy due to size %d of column %d\n", col_size, col); + printf("%p <- input_data[%d] which is %d\n", + shmem_dest, + input_offset, + input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -463,11 +488,13 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + int8_t *valid_byte = + &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -479,11 +506,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * (block.end_row - block.start_row); + col_offset += col_sizes[col] * rows_in_block; } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -496,30 +523,75 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + if (debug_print) { + printf("writing final data from %d to %d at stride %d\n", + thread_start_offset, + shmem_row_size * rows_in_block, + thread_stride); + printf("rows in block %d\n", rows_in_block); + } + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - // the first part and last part of the row is unaligned data copy. This is copied a single byte - // at a time. - if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first part of a row, copy single bytes + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + + // three cases to worry about here + // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front + // 2) last 8-byte part of a large row - some bytes of pad at the end + // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front + // AND potentially pad at the rear + + // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. + // 1st case is when we're at some even multiple of shmem_row_size offset. 
+ // 2nd case is when offset + 8 is some even multiple of shmem_row_size. + // must be an 8 byte copy + + // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? + if (real_bytes_in_row + dest_shim_offset <= 8) { + // case 3, we want to copy real_bytes_in_row bytes + auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("case 3 - %d single byte final write %p -> %p\n", + num_single_bytes, + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } + output_ptr[i] = input_ptr[i + dest_shim_offset]; + } + } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first byte with leading pad auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i=0; i %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } - } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { - // last part of a row, copy single bytes - auto const num_single_bytes = dest_shim_offset; - for (auto i=0; i 0) { + // last bytes of a row + auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("single trailing byte final write %p -> %p\n", + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } } else { // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); + const int64_t *long_col_input = reinterpret_cast(input_ptr); + if (debug_print) { + printf( + "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); + } *reinterpret_cast(output_ptr) = *long_col_input; } } @@ -696,13 +768,14 @@ std::vector> convert_to_rows2(cudf::table_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough - // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. constexpr int max_window_height = 1024; - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); - #if defined(DEBUG) +#if defined(DEBUG) auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); @@ -714,7 +787,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu Bytes", i); } }; - #endif +#endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -722,11 +795,11 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - #if defined(DEBUG) +#if defined(DEBUG) size_t free, total; - cudaMemGetInfo( &free, &total ); - printf("%lu/%lu Memory", free, total); - #endif + cudaMemGetInfo(&free, &total); + printf("%lu/%lu Memory\n", free, total); +#endif // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -743,45 +816,46 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns+1); // we add a final offset for validity data start + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); + auto cv = tbl.column(col); + auto col_type = cv.type(); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - if (nested_type) { variable_width_columns.push_back(cv);} + if (nested_type) { variable_width_columns.push_back(cv); } // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; } - + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. - // Note that this boundary is for our book-keeping with column pointers and not anything - // that the kernel needs to worry about. We cut the output at convienient boundaries - // when assembling the outgoing data stream. + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. 
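// A tiny standalone sketch of the 32-row boundary arithmetic described above
// (the row count is made up): when adding the next row would overflow the
// 32-bit offset limit, the batch keeps the rows up to the last multiple of 32
// and the remainder carries into the next batch, so the validity cut lands on
// a 32-row boundary.
#include <cstdio>

int main()
{
  int const rows_when_limit_hit = 1000005;                     // hypothetical
  int const rows_kept_in_batch  = rows_when_limit_hit & ~31;   // 1000000
  int const rows_carried_over   = rows_when_limit_hit & 31;    // 5
  printf("keep %d rows in this batch, carry %d rows into the next one\n",
         rows_kept_in_batch, rows_carried_over);
  return 0;
}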
struct row_batch { size_type num_bytes; size_type row_count; @@ -798,71 +872,90 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. return 0; -/* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } -*/ + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + auto calculate_validity_size = [](int const num_cols) { + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + return (num_cols + 7) / 8; + }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate - // the size of each row's variable-width data as well. + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. for (int row = 0; row < num_rows; ++row) { - row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); - if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += calculate_validity_size(num_columns); + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; + row_batches.push_back( + row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size += row_sizes[row]; + row_batch_size = aligned_row_batch_size + row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); 
} - #if defined(DEBUG) +#if defined(DEBUG) + printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); printf("\n"); } - #endif +#endif std::vector block_infos; // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access - int current_window_size = 0; + int current_window_width = 0; int current_window_start_col = 0; // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + auto build_blocks = [&block_infos, &row_batches, num_rows]( + int const start_col, int const end_col, int const desired_window_height) { int current_window_start_row = 0; int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; while (i < num_rows) { if (rows_left_in_batch == 0) { current_window_row_batch++; @@ -872,9 +965,10 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, num_rows - 1), + current_window_row_batch}); i += window_height; current_window_start_row += window_height; @@ -882,7 +976,17 @@ std::vector> convert_to_rows2(cudf::table_view con } }; - int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + int const window_height = + std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); +#if defined(DEBUG) + printf( + "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + max_window_height, + num_rows, + row_batches[0].row_count, + window_height); +#endif int row_size = 0; @@ -891,32 +995,74 @@ std::vector> convert_to_rows2(cudf::table_view con auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; if (row_size_with_this_col * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_this_col * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_this_col, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col - 1, window_height); - row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row current_window_start_col = col; + current_window_width = 0; } else { row_size = row_size_with_this_col; + current_window_width++; } } - auto validity_offset = detail::align_offset(column_starts.back(), 4); +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif + auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); column_starts.push_back(validity_offset); - + // build last set of blocks - if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); + } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while + // calculating other things std::vector input_data; std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); + column_view cv = tbl.column(column_number); auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; if (!nested_type) { input_data.emplace_back(cv.data()); @@ -924,81 +1070,87 @@ std::vector> convert_to_rows2(cudf::table_view con } } - #if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); +#if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", + block_infos.size(), + block_infos[0].end_col - block_infos[0].start_col + 1, + block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); - #endif +#endif auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); auto 
dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); std::vector output_buffers; std::vector output_data; output_data.reserve(row_batches.size()); - for (uint i=0; i(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(1024); - copy_from_columns<<>>(num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + copy_from_columns<<>>( + num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = - std::make_unique(data_type{cudf::type_id::INT8}, - row_batches[i].num_bytes, - std::move(output_buffers[i])); + auto data = std::make_unique( + data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp new file mode 100644 index 00000000000..c02f83ad1d5 --- /dev/null +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +struct ColumnToRowTests : public cudf::test::BaseFixture { +}; + +TEST_F(ColumnToRowTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Simple) +{ + cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Tall) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Wide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, SingleByteWide) +{ + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + auto new_rows = cudf::convert_to_rows2(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} From 170a771d84347c0dd30ec9d9aa8eaf8041279ccf Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 21 Jun 2021 18:17:45 +0000 Subject: [PATCH 38/80] Updating windows to be generated in a square way so we can have more data to write out as 8-byte writes from shared memory. Shuffled some of the copy to GPU code up so it can start the copy sooner and hopefully won't force stalls. Some bug fixes. 
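// A minimal sketch of the "start the copy sooner" idea from the commit message,
// assuming the usual CUDA async-copy pattern; the names and sizes below are
// illustrative, not the functions this patch actually reorders: enqueue the
// host-to-device copies as soon as their data is ready, keep doing host-side
// planning while they are in flight, and let later work on the same stream
// order itself after the copies.
#include <cuda_runtime.h>
#include <vector>

int main()
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  std::vector<int> host_offsets(1024, 0);  // hypothetical metadata built on the host
  int *dev_offsets = nullptr;
  cudaMalloc(&dev_offsets, host_offsets.size() * sizeof(int));

  // 1) kick off the copy early (for true copy/compute overlap the host buffer
  //    would also need to be pinned via cudaMallocHost or cudaHostRegister)
  cudaMemcpyAsync(dev_offsets,
                  host_offsets.data(),
                  host_offsets.size() * sizeof(int),
                  cudaMemcpyHostToDevice,
                  stream);

  // 2) ...more host-side planning can happen here, overlapping the transfer...

  // 3) kernels launched on the same stream are ordered after the copy, so they
  //    can safely read dev_offsets; synchronize only when the host needs results.
  cudaStreamSynchronize(stream);
  cudaFree(dev_offsets);
  cudaStreamDestroy(stream);
  return 0;
}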
--- .../row_conversion/row_conversion.cpp | 15 ++- cpp/src/row_conversion/row_conversion.cu | 96 +++++++++++-------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index e1228c9df21..d6b195433cf 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -125,7 +125,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); }*/ -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ @@ -134,8 +134,17 @@ static void BM_new_to_row(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 92ba075c316..3f221e2f716 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -364,7 +364,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
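The comment above describes the per-block tiling: each CUDA block is handed one pre-computed tile of the table (a column range by a row range) and stages it through shared memory. A standalone mirror of that bookkeeping, using the same field names the kernel reads from block_info but defined here only for illustration:

    // Illustration only: mirrors the fields the kernel reads from block_info.
    struct tile {
      int start_col, start_row;  // first column / row covered (inclusive)
      int end_col, end_row;      // last column / row covered (inclusive)
      int buffer_num;            // which output row batch this tile writes into
    };

    inline int rows_in_tile(tile const &t) { return t.end_row - t.start_row + 1; }
    inline int cols_in_tile(tile const &t) { return t.end_col - t.start_col + 1; }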
- bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,6 +383,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } + //else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -416,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); - printf("shmem row size %d\n", shmem_row_size); + printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); printf("validity offset is %d\n", validity_offset); printf("starting at %d,%d and going to %d, %d\n", block.start_col, @@ -524,6 +525,8 @@ __global__ void copy_from_columns(const size_type num_rows, // auto const thread_start_offset = threadIdx.x * 8; auto const thread_stride = gridDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; + if (debug_print) { printf("writing final data from %d to %d at stride %d\n", thread_start_offset, @@ -531,7 +534,7 @@ __global__ void copy_from_columns(const size_type num_rows, thread_stride); printf("rows in block %d\n", rows_in_block); } - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + for (auto src_offset = thread_start_offset; src_offset < end_offset; src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; auto const row_offset = row_offsets[block.start_row + output_row_num]; @@ -771,7 +774,6 @@ std::vector> convert_to_rows2(cudf::table_view con // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. - constexpr int max_window_height = 1024; const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -816,6 +818,25 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
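The row-batch rule spelled out above amounts to: accumulate rows until the batch's byte size would no longer fit a 32-bit offset, then close the batch at the last 32-row boundary so validity words stay aligned. A host-side sketch of that rule, assuming per-row sizes are already known (a simplification of the loop added later in this patch):

    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <numeric>
    #include <vector>

    struct batch { int64_t num_bytes; int row_count; };

    // Illustration only: split rows into batches that stay under the 32-bit offset limit.
    inline std::vector<batch> make_row_batches(std::vector<int64_t> const &row_sizes)
    {
      std::vector<batch> out;
      std::vector<int64_t> open;  // sizes of rows in the currently open batch
      int64_t open_bytes = 0;

      auto close = [&](std::size_t keep) {
        int64_t const kept = std::accumulate(open.begin(), open.begin() + keep, int64_t{0});
        out.push_back({kept, static_cast<int>(keep)});
        open.erase(open.begin(), open.begin() + keep);
        open_bytes -= kept;
      };

      for (auto const sz : row_sizes) {
        if (!open.empty() && open_bytes + sz > std::numeric_limits<int32_t>::max()) {
          std::size_t const keep = open.size() & ~std::size_t{31};  // last 32-row boundary
          close(keep != 0 ? keep : open.size());
        }
        open.push_back(sz);
        open_bytes += sz;
      }
      if (!open.empty()) { close(open.size()); }
      return out;
    }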
+ // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column @@ -847,6 +868,9 @@ std::vector> convert_to_rows2(cudf::table_view con fixed_width_size_per_row += col_size; } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient @@ -901,17 +925,18 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. + auto validity_size = calculate_validity_size(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned row_sizes[row] = fixed_width_size_per_row; // validity is byte aligned - row_sizes[row] += calculate_validity_size(num_columns); + row_sizes[row] += validity_size; // variable width data is 8-byte aligned row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); @@ -932,7 +957,9 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); } -#if defined(DEBUG) + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -942,6 +969,16 @@ std::vector> convert_to_rows2(cudf::table_view con } #endif + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + std::vector block_infos; // block infos are organized with the windows going "down" the 
columns @@ -976,8 +1013,13 @@ std::vector> convert_to_rows2(cudf::table_view con } }; + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized + // access, but since other blocks will read/write the edges this may not turn out to be overly important. + // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. + // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are + // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. int const window_height = - std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); #if defined(DEBUG) printf( "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " @@ -998,20 +1040,21 @@ std::vector> convert_to_rows2(cudf::table_view con std::size_t alignment_needed = col_size; // They are the same for fixed width types auto row_size_aligned = detail::align_offset(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_this_col * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { #if defined(DEBUG) printf( "Window size %d too large at column %d, bumping back to build windows of size %d(cols " "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", - row_size_with_this_col * window_height, + row_size_with_end_pad * window_height, col, row_size * window_height, current_window_start_col, col - 1, window_height, - row_size_with_this_col, + row_size_with_end_pad, row_size, row_size_aligned, shmem_limit_per_block); @@ -1055,20 +1098,6 @@ std::vector> convert_to_rows2(cudf::table_view con build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while - // calculating other things - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1083,26 +1112,11 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - 
output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); + dim3 threads(std::min(1024, shmem_limit_per_block / 8)); #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); From a82cee8488b0d7aa61b4361b41c69fdf2bf07ccc Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jul 2021 01:52:36 +0000 Subject: [PATCH 39/80] Adding row to column conversion code. Performance falls off a cliff, but starts out reasonably. I haven't looked at this in nsight yet. --- .../row_conversion/row_conversion.cpp | 74 +- cpp/include/cudf/row_conversion.hpp | 12 + cpp/src/row_conversion/row_conversion.cu | 759 +++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 106 +++ 4 files changed, 748 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index d6b195433cf..7c1f52c5cd6 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -91,7 +91,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -/*static void BM_from_row(benchmark::State& state) +static void BM_old_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -123,36 +123,62 @@ static void BM_new_to_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -}*/ - -#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +} + +static void BM_new_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows2(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} -#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, 
name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_from_row(st); } \ + (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 22}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index f5e2225ad19..282ffa4b0cb 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,4 +48,16 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows2( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 3f221e2f716..c0e78a03576 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -30,6 +30,7 @@ #include #include +#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" @@ -332,6 +333,20 @@ struct block_info { int buffer_num; }; +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. 
+struct row_batch { + size_type num_bytes; + size_type row_count; +}; + /** * @brief copy data from cudf columns into x format, which is row-based * @@ -364,7 +379,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,7 +398,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } - //else { return; } + // else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -403,7 +418,7 @@ __global__ void copy_from_columns(const size_type num_rows, block.buffer_num); } // each thread is responsible for every threadcount rows of data. - // the data is copies into shared memory in the final layout. + // the data is copied into shared memory in the final layout. auto const real_bytes_in_row = col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, @@ -432,7 +447,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += gridDim.x) { + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { if (debug_print) { printf("shmem row %d(%d) at offset %d(%d)\n", row - block.start_row, @@ -524,8 +539,8 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. 
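One of the bug fixes in this commit is visible above: the per-row loop now strides by blockDim.x instead of gridDim.x. Each CUDA block owns exactly one tile, so the stride that covers every row of the tile exactly once is the block's thread count, not the grid's block count. A minimal sketch of the intended pattern (kernel name and parameters are illustrative):

    // Illustration: threads of one block cooperatively walk the tile's rows.
    __global__ void walk_tile_rows(int start_row, int end_row /* inclusive */)
    {
      for (int row = start_row + threadIdx.x; row <= end_row; row += blockDim.x) {
        // copy this row's slice of the tile into shared memory ...
      }
    }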
// auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; + auto const thread_stride = blockDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; if (debug_print) { printf("writing final data from %d to %d at stride %d\n", @@ -559,9 +574,10 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; for (auto i = 0; i < num_single_bytes; ++i) { if (debug_print) { - printf("case 3 - %d single byte final write %p -> %p\n", + printf("case 3 - %d single byte final write %p(%d) -> %p\n", num_single_bytes, &input_ptr[i + dest_shim_offset], + input_ptr[i + dest_shim_offset], &output_ptr[i]); } output_ptr[i] = input_ptr[i + dest_shim_offset]; @@ -600,6 +616,237 @@ __global__ void copy_from_columns(const size_type num_rows, } } +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type *offsets, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const size_type *col_sizes, + const size_type *col_offsets, + const block_info *block_infos, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i = 0; i < num_columns; ++i) { + printf("col %d is at %p with size %d and offset %d\n", + i, + output_data[i], + col_sizes[i], + col_offsets[i]); + } + printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + /* printf("Row Offsets:\n"); + for (int i=0; i(&input_data[offsets[absolute_row] + offset_in_row]); + if (debug_print) { + printf("which will be address %p\n", long_col_input); + printf("%p <- long %lu\n", shmem_dest, *long_col_input); } + *reinterpret_cast(shmem_dest) = *long_col_input; + } + + __syncthreads(); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
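The loop that follows decodes a single flat element index into a (column, row) pair inside the tile, so threads can be assigned elements independently of the tile's shape. A host-side analogue of the decode, assuming cols_in_block columns per tile:

    // Illustration: flat index -> (relative column, relative row) within a tile.
    struct element_pos { int relative_col; int relative_row; };

    inline element_pos decode_element(int index, int cols_in_block)
    {
      return {index % cols_in_block, index / cols_in_block};
    }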
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; + auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + + shared_memory_row_offset + shared_memory_starting_pad; + auto const column_size = col_sizes[absolute_col]; + + int8_t *shmem_src = &shared_data[shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, + shmem_src, dst) ; + } + switch (column_size) { + case 1: { + if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } + *dst = *shmem_src; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } + *reinterpret_cast(dst) = *short_col_input; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } + *reinterpret_cast(dst) = *int_col_input; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } + *reinterpret_cast(dst) = *long_col_input; + break; + } + default: { + if (debug_print) { + printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + } + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } + break; + } + } + } + + __syncthreads(); + + // now handle validity. Each thread is responsible for 32 rows in a single column. + // to prevent indexing issues with a large number of threads, this is compressed + // to a single loop like above. TODO: investigate using shared memory here + auto const validity_batches_per_col = (num_rows + 31) / 32; + auto const validity_batches_total = validity_batches_per_col * num_columns; + if (debug_print) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + } + for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { + // what column is this? 
+ auto const col = index / validity_batches_per_col; + auto const batch = index % validity_batches_per_col; + auto const starting_row = batch * 32; + auto const validity_offset = col_offsets[num_columns] + col / 8; + + if (debug_print) { + printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + } + + int32_t dst_validity = 0; + for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { + int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + + if (debug_print) { + printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + } + + auto const val_byte = *validity_ptr; + auto const src_shift = col % 8; + auto const dst_shift = row % 32; + auto const src_bit_mask = 1 << src_shift; + if (debug_print) { + printf("src bit mask is 0x%x\n", src_bit_mask); + printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); + printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); + } +// auto const dst_bit_mask = 1 << dst_shift; + dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); + if (debug_print) { + printf("validity is now 0x%x\n", dst_validity); + } + } + + + int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); + if (debug_print) { + printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); + printf("validity to write is %d\n", dst_validity); + printf("validity write %p <- %d\n", validity_ptr, dst_validity); + } + *validity_ptr = dst_validity; + } +} + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -764,21 +1011,165 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -} // namespace detail +template +static size_type compute_column_information( + iterator begin, + iterator end, + std::vector &column_starts, + std::vector &column_sizes)//, + //std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + +// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 
8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} //#define DEBUG -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + +static std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. 
+ int const window_height = std::min( + std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), + row_batches[0].row_count); +#if defined(DEBUG) + printf( + "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + total_number_of_rows, + row_batches[0].row_count, + window_height); +#endif + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_end_pad * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_end_pad, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + } + + return block_infos; +} +} // namespace detail #if defined(DEBUG) - auto pretty_print = [](uint64_t i) { + void pretty_print(uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -788,9 +1179,19 @@ std::vector> convert_to_rows2(cudf::table_view con } else { printf("%lu Bytes", i); } - }; + } #endif +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. 
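Downstream of the window computation, the launch shape follows directly from the tile list: one CUDA block per block_info entry, with the thread count capped by how many 8-byte lanes the shared-memory staging buffer can feed. A rough sketch of that relationship; the exact caps vary between the commits in this series:

    #include <algorithm>
    #include <cstddef>
    #include <cuda_runtime.h>

    // Illustration: one block per tile; threads limited by 8-byte lanes in shared memory.
    inline dim3 grid_for_tiles(std::size_t num_tiles)
    {
      return dim3(static_cast<unsigned int>(num_tiles));
    }

    inline dim3 block_for_shmem(int shmem_limit_per_block)
    {
      return dim3(static_cast<unsigned int>(std::min(1024, shmem_limit_per_block / 8)));
    }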
+ const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -834,8 +1235,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -848,43 +1249,48 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, + iter + num_columns, + column_starts, + column_sizes);//, +// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ - if (nested_type) { variable_width_columns.push_back(cv); } +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - - // When building the columns to return, we have to be mindful of the offset limit in cudf. 
- // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. - struct row_batch { - size_type num_bytes; - size_type row_count; - }; - std::vector row_batches; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -936,10 +1342,11 @@ std::vector> convert_to_rows2(cudf::table_view con row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( - row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); row_batch_size = 0; row_batch_rows = row_batch_rows & 31; row_offset = 0; @@ -954,12 +1361,12 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - #if defined(DEBUG) +#if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -979,125 +1386,8 @@ std::vector> convert_to_rows2(cudf::table_view con } auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back( - detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has 
lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized - // access, but since other blocks will read/write the edges this may not turn out to be overly important. - // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. - // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are - // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. - int const window_height = - std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); -#if defined(DEBUG) - printf( - "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - max_window_height, - num_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (int col = 0; col < num_columns; ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); - column_starts.push_back(validity_offset); - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); - } - + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1116,7 +1406,11 @@ std::vector> convert_to_rows2(cudf::table_view con // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 
threads(std::min(1024, shmem_limit_per_block / 8)); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); + #endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); @@ -1206,11 +1500,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); + zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid(true, stream); + step->set_valid_async(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1238,6 +1532,97 @@ std::vector> convert_to_rows(cudf::table_view cons } } +std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + + size_type validity_size = (num_columns + 7) / 8; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + + dim3 blocks(block_infos.size()); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); + #endif +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + detail::copy_to_columns<<>>( + num_rows, + num_columns, + input.offsets().data(), + dev_output_data.data(), + dev_output_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + child.data()); + + return std::make_unique(std::move(output_columns)); +} + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1318,4 +1703,20 @@ std::unique_ptr convert_from_rows( // } } +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows2(lcv, schema, stream, mr); + + return ret; + // } +} + } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index c02f83ad1d5..818d7a89ddb 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,9 +21,13 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" +#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; +struct RowToColumnTests : public cudf::test::BaseFixture { +}; TEST_F(ColumnToRowTests, Single) { @@ -108,3 +112,105 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i a({-1, 0, 1}); + cudf::table_view 
in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i Date: Thu, 8 Jul 2021 20:45:18 +0000 Subject: [PATCH 40/80] updating to use make_device_uvector_async and bitmask functions per review comments --- cpp/src/row_conversion/row_conversion.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c0e78a03576..c73e967cf0f 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -45,32 +47,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync( - ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - -template -rmm::device_uvector copy_to_dev_async2(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - rmm::device_uvector ret(input.size(), stream, mr); - CUDA_TRY(cudaMemcpyAsync( - ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -180,8 +156,8 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } @@ -278,8 +254,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -505,8 +481,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; - cudf::size_type byte_bit_offset = col % 8; + &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; + cudf::size_type byte_bit_offset = intra_word_index(col); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -648,7 +624,7 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
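For reference on the helpers this commit switches to: in cudf's bit utilities, word_index and intra_word_index operate on 32-bit bitmask words (bitmask_type), i.e. they divide and take the remainder by 32 rather than by 8. The equivalences below are my reading of those utilities, stated as an assumption about the library rather than as part of this patch:

    // Assumed behaviour of cudf::word_index / cudf::intra_word_index for a
    // 32-bit bitmask_type (word-based, unlike the `/ 8` and `% 8` they replace).
    constexpr int bits_per_bitmask_word = 32;

    constexpr int word_index_equiv(int bit_index) { return bit_index / bits_per_bitmask_word; }
    constexpr int intra_word_index_equiv(int bit_index) { return bit_index % bits_per_bitmask_word; }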
- bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -806,7 +782,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const col = index / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + col / 8; + auto const validity_offset = col_offsets[num_columns] + word_index(col); if (debug_print) { printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); @@ -821,7 +797,7 @@ __global__ void copy_to_columns(const size_type num_rows, } auto const val_byte = *validity_ptr; - auto const src_shift = col % 8; + auto const src_shift = intra_word_index(col); auto const dst_shift = row % 32; auto const src_bit_mask = 1 << src_shift; if (debug_print) { @@ -920,10 +896,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -954,10 +930,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -1004,7 +980,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = word_index(schema.size() + 7); // validity comes at the end and is byte aligned so we can pack more in. 
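// A small host-side sketch of the two validity sizes at play here, assuming the
// layout described above: the row format packs one validity bit per column, byte
// aligned, while cudf bitmasks are built from 32-bit words. Names are illustrative.
#include <cstddef>
#include <cstdint>

constexpr int32_t row_validity_bytes(std::size_t num_columns)
{
  return static_cast<int32_t>((num_columns + 7) / 8);  // byte-aligned bits
}

constexpr int32_t bitmask_word_bytes(std::size_t num_columns)
{
  return static_cast<int32_t>((num_columns + 31) / 32) * 4;  // whole 32-bit words
}

// 10 columns need 2 bytes of byte-aligned validity but 4 bytes of bitmask words.
static_assert(row_validity_bytes(10) == 2, "byte-aligned validity");
static_assert(bitmask_word_bytes(10) == 4, "32-bit bitmask words");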
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1235,8 +1211,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -1287,8 +1263,8 @@ std::vector> convert_to_rows2(cudf::table_view con #endif - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); std::vector row_batches; @@ -1322,16 +1298,9 @@ std::vector> convert_to_rows2(cudf::table_view con size_type row_batch_rows = 0; uint64_t row_offset = 0; - auto calculate_validity_size = [](int const num_cols) { - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - return (num_cols + 7) / 8; - }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = calculate_validity_size(num_columns); + auto validity_size = num_bitmask_words(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1364,7 +1333,7 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); @@ -1384,7 +1353,7 @@ std::vector> convert_to_rows2(cudf::table_view con output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); @@ -1402,7 +1371,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); @@ -1443,7 +1412,7 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); auto offsets = std::make_unique( data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); @@ -1477,8 +1446,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = 
detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1495,8 +1464,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -1561,7 +1530,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = (num_columns + 7) / 8; + size_type validity_size = num_bitmask_words(num_columns); size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1569,8 +1538,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // this is probably fine CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // build the row_batches from the passed in list column std::vector row_batches; @@ -1590,13 +1559,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(block_infos.size()); #if defined(DEBUG) || 1 @@ -1647,8 +1616,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1663,8 +1632,8 @@ 
std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1675,10 +1644,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); From b044f8b10c606b495bbb4284a754f50f4a6eb7a4 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jul 2021 07:18:49 +0000 Subject: [PATCH 41/80] updating conversion code. Found out bit operations are on 32-bit values, so they can't be used since row data has byte-aligned validity. Performance improvements on the row to column side. --- cpp/src/row_conversion/row_conversion.cu | 106 ++++++++++++----------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c73e967cf0f..0879a1c50a5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -37,6 +37,8 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) + using cudf::detail::make_device_uvector_async; namespace cudf { @@ -156,11 +158,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -254,8 +256,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -481,8 +483,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; - cudf::size_type 
byte_bit_offset = intra_word_index(col); + &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -597,6 +599,7 @@ __global__ void copy_from_columns(const size_type num_rows, * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block * @param offsets * @param output_data * @param output_nm @@ -608,6 +611,7 @@ __global__ void copy_from_columns(const size_type num_rows, */ __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *offsets, int8_t **output_data, cudf::bitmask_type **output_nm, @@ -624,18 +628,10 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - output_data[i], - col_sizes[i], - col_offsets[i]); - } printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i blockDim.x) { + break; + } + auto block = block_infos[this_block_index]; auto const rows_in_block = block.end_row - block.start_row + 1; auto const cols_in_block = block.end_col - block.start_col + 1; extern __shared__ int8_t shared_data[]; @@ -767,61 +769,58 @@ __global__ void copy_to_columns(const size_type num_rows, } } - __syncthreads(); - - // now handle validity. Each thread is responsible for 32 rows in a single column. + // now handle validity. Each thread is responsible for 32 rows in 8 columns. // to prevent indexing issues with a large number of threads, this is compressed // to a single loop like above. TODO: investigate using shared memory here auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = validity_batches_per_col * num_columns; - if (debug_print) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); + if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); } - for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { - // what column is this? 
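// A minimal host-side model of the bit movement below, assuming the row format
// described in this file: each row stores one validity bit per column, byte
// aligned, and each output bitmask word collects 32 rows of a single column.
// accumulate_validity is an illustrative helper, not the kernel's code.
#include <cstdint>

inline void accumulate_validity(uint8_t const *row_validity,  // start of this row's validity bytes
                                int col,
                                int row,
                                uint32_t *column_word)  // word covering rows [32*(row/32), +32)
{
  uint32_t const bit = (row_validity[col / 8] >> (col % 8)) & 1u;
  *column_word |= bit << (row % 32);
}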
- auto const col = index / validity_batches_per_col; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { + auto const start_col = (index * 8) / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + word_index(col); + auto const validity_offset = col_offsets[num_columns] + (start_col / 8); if (debug_print) { - printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); } - int32_t dst_validity = 0; + // one for each column + int32_t dst_validity[8] = {0}; for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; if (debug_print) { - printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); } auto const val_byte = *validity_ptr; - auto const src_shift = intra_word_index(col); - auto const dst_shift = row % 32; - auto const src_bit_mask = 1 << src_shift; - if (debug_print) { - printf("src bit mask is 0x%x\n", src_bit_mask); - printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); - printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); - } -// auto const dst_bit_mask = 1 << dst_shift; - dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - if (debug_print) { - printf("validity is now 0x%x\n", dst_validity); + + for (int i=0; i> src_shift); + } + // auto const dst_bit_mask = 1 << dst_shift; + dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); - if (debug_print) { - printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); - printf("validity to write is %d\n", dst_validity); - printf("validity write %p <- %d\n", validity_ptr, dst_validity); + for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); + if (debug_print) { + printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); + } + *validity_ptr = dst_validity[i]; } - *validity_ptr = dst_validity; } } +} /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -980,7 +979,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = word_index(schema.size() + 7); + int32_t validity_bytes_needed = (schema.size() + 7) / 8; // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1300,7 +1299,7 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns); + auto validity_size = num_bitmask_words(num_columns) * 4; for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1521,6 +1520,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; + std::vector column_starts; std::vector column_sizes; @@ -1530,7 +1531,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = num_bitmask_words(num_columns); + size_type validity_size = num_bitmask_words(num_columns) * 4; size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1567,7 +1568,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(block_infos.size()); + dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); #if defined(DEBUG) || 1 dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); #else @@ -1581,6 +1582,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i detail::copy_to_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_output_nm.data(), From d2a33ed396e935a9b1c8ca44df26b51bc37e2d9b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 7 Jun 2021 08:14:52 +0000 Subject: [PATCH 42/80] working on row and column conversions --- cpp/benchmarks/CMakeLists.txt | 41 +- .../row_conversion/row_conversion.cpp | 106 +- cpp/include/cudf/row_conversion.hpp | 12 - cpp/src/row_conversion/row_conversion.cu | 1183 +++++------------ 4 files changed, 357 insertions(+), 985 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index fa1e61e26fd..a8f075d2464 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -22,10 +22,21 @@ target_compile_options( "$<$:${CUDF_CUDA_FLAGS}>" ) +<<<<<<< HEAD target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main benchmark::benchmark nvbench::nvbench Threads::Threads cudf ) +======= +target_link_libraries(cudf_datagen + PUBLIC GTest::gmock + GTest::gtest + GTest::gmock_main + GTest::gtest_main + benchmark::benchmark + Threads::Threads + cudf) +>>>>>>> working on row and column conversions target_include_directories( cudf_datagen @@ -46,6 +57,7 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) # This function takes in a benchmark name and benchmark source and handles setting all of the # associated properties and linking to build the benchmark function(ConfigureBench CMAKE_BENCH_NAME) +<<<<<<< HEAD add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties( ${CMAKE_BENCH_NAME} PROPERTIES 
RUNTIME_OUTPUT_DIRECTORY @@ -71,6 +83,17 @@ endfunction() # ################################################################################################## # * column benchmarks ----------------------------------------------------------------------------- +======= + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties(${CMAKE_BENCH_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + target_link_libraries(${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) +endfunction() + +################################################################################################### +# - column benchmarks ----------------------------------------------------------------------------- +>>>>>>> working on row and column conversions ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) # ################################################################################################## @@ -81,12 +104,17 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) # * scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) +<<<<<<< HEAD # ################################################################################################## # * lists scatter benchmark ----------------------------------------------------------------------- ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) # ################################################################################################## # * contiguous_split benchmark ------------------------------------------------------------------- +======= +################################################################################################### +# - contiguous_split benchmark ------------------------------------------------------------------- +>>>>>>> working on row and column conversions ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) # ################################################################################################## @@ -110,10 +138,16 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask_ben # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchmark.cpp) +<<<<<<< HEAD # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) +======= +################################################################################################### +# - join benchmark -------------------------------------------------------------------------------- +ConfigureBench(JOIN_BENCH join/join_benchmark.cu) +>>>>>>> working on row and column conversions # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- @@ -239,7 +273,6 @@ ConfigureBench( string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp - string/repeat_strings_benchmark.cpp string/replace_benchmark.cpp string/replace_re_benchmark.cpp string/split_benchmark.cpp @@ -248,6 +281,7 @@ ConfigureBench( 
string/url_decode_benchmark.cpp ) +<<<<<<< HEAD # ################################################################################################## # * json benchmark ------------------------------------------------------------------- ConfigureBench(JSON_BENCH string/json_benchmark.cpp) @@ -255,3 +289,8 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) +======= +################################################################################################### +# - row conversion benchmark ---------------------------------------------------------------------------- +ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) +>>>>>>> working on row and column conversions diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 7c1f52c5cd6..c4edee91b3c 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,7 +25,7 @@ class RowConversion : public cudf::benchmark { }; -static void BM_old_to_row(benchmark::State& state) +static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -37,44 +37,8 @@ static void BM_old_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 212, + 50, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto rows = cudf::convert_to_rows(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -85,13 +49,14 @@ static void BM_new_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); +// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_old_from_row(benchmark::State& state) +static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -105,6 +70,9 @@ static void BM_old_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); + /* auto const table = 
create_random_table({cudf::type_id::INT32}, + 4, + row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -125,60 +93,24 @@ static void BM_old_from_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_new_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows(table->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows2(rows, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { BM_to_row(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 16, 1 << 24}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ + (::benchmark::State & st) { BM_from_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ + ->Ranges({{1 << 6, 1 << 22}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 282ffa4b0cb..f5e2225ad19 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,16 +48,4 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows2( - cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - } // namespace cudf diff --git 
a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0879a1c50a5..fb5dc4cb38d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,14 +14,12 @@ * limitations under the License. */ -#include #include #include #include #include #include -#include #include #include #include @@ -31,15 +29,11 @@ #include #include -#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" -#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) - -using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -49,6 +43,34 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } + +/** + * Copy a simple vector to device memory asynchronously. Be sure to read + * the data on the same stream as is used to copy it. + */ +template +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync( + ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + +template +rmm::device_uvector copy_to_dev_async2( + const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + rmm::device_uvector ret(input.size(), stream, mr); + CUDA_TRY(cudaMemcpyAsync( + ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); + return ret; +} + __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -162,7 +184,7 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -311,20 +333,6 @@ struct block_info { int buffer_num; }; -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. 
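// A host-side sketch of the batching idea described above, simplified so that a
// batch is closed whenever the next row would push its byte count past what a
// 32-bit offset can address; as the comment above notes, the real code prefers to
// cut on 32-row boundaries as well, which is omitted here. Names are illustrative.
#include <cstdint>
#include <limits>
#include <vector>

struct row_batch_sketch {
  int64_t num_bytes;
  int32_t row_count;
};

inline std::vector<row_batch_sketch> batch_rows(std::vector<int32_t> const &row_sizes)
{
  std::vector<row_batch_sketch> batches;
  int64_t bytes_in_batch = 0;
  int32_t rows_in_batch  = 0;
  for (auto const size : row_sizes) {
    // close the current batch if this row would overflow a 32-bit offset
    if (bytes_in_batch + size > std::numeric_limits<int32_t>::max()) {
      batches.push_back({bytes_in_batch, rows_in_batch});
      bytes_in_batch = 0;
      rows_in_batch  = 0;
    }
    bytes_in_batch += size;
    ++rows_in_batch;
  }
  if (rows_in_batch > 0) { batches.push_back({bytes_in_batch, rows_in_batch}); }
  return batches;
}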
-struct row_batch { - size_type num_bytes; - size_type row_count; -}; - /** * @brief copy data from cudf columns into x format, which is row-based * @@ -337,16 +345,16 @@ struct row_batch { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ -__global__ void copy_from_columns(const size_type num_rows, - const size_type num_columns, +__global__ void copy_from_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, const int8_t **input_data, - const bitmask_type **input_nm, - const size_type *col_sizes, - const size_type *col_offsets, + const cudf::bitmask_type **input_nm, + const cudf::size_type *col_sizes, + const cudf::size_type *col_offsets, const block_info *block_infos, - const size_type *row_offsets, + const uint64_t *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -357,119 +365,46 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - input_data[i], - col_sizes[i], - col_offsets[i]); - } - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & - 7; // offset for alignment shim in order to match shared memory with final dest - if (debug_print) { - printf("outputting to offset %lu\n", output_start_offset); - printf("dest shim offset is %d\n", dest_shim_offset); - printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); - printf("my block is %d,%d -> %d,%d - buffer %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row, - block.buffer_num); - } + uint8_t const dest_shim_offset = reinterpret_cast(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + + printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); + // each thread is responsible for every threadcount rows of data. - // the data is copied into shared memory in the final layout. - auto const real_bytes_in_row = - col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; - auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, - 8); // 8 byte alignment required for shared memory rows + // the data is copies into shared memory in the final layout. 
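// A small sketch of the alignment shim used just below: the shared-memory copy of
// a row is padded at the front by the destination's misalignment (dest_shim_offset)
// so that the later 8-byte writes out of shared memory land on 8-byte-aligned
// destination addresses. padded_shmem_row_size is an illustrative name.
#include <cstdint>

constexpr int32_t padded_shmem_row_size(int32_t row_bytes, uint8_t dest_shim_offset)
{
  // shim bytes in front, then round the whole row up to an 8-byte multiple
  return (row_bytes + dest_shim_offset + 7) & ~7;
}

// a 13-byte row whose destination starts 3 bytes past an 8-byte boundary
// occupies 16 bytes of shared memory (3 bytes of shim + 13 bytes of data)
static_assert(padded_shmem_row_size(13, 3) == 16, "shim then 8-byte pad");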
+ auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; - if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", - block.end_col, - col_offsets[block.end_col], - block.end_col, - col_sizes[block.end_col], - block.start_col, - col_offsets[block.start_col]); - printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); - printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row); - } - for (int col = block.start_col; col <= block.end_col; ++col) { - /*if (!col_is_variable) */ { - uint64_t col_offset = 0; + for (int col=block.start_col; col<=block.end_col; ++col) { + /*if (!col_is_variable) */{ + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = - col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - if (debug_print) { - printf("shmem row %d(%d) at offset %d(%d)\n", - row - block.start_row, - row, - (row - block.start_row) * shmem_row_size, - row * shmem_row_size); - } - int8_t *shmem_dest = - &shared_data[dest_col_offset + shmem_row_size * (row - block.start_row)]; + auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); - } + const int32_t *int_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", col_size, col); - printf("%p <- input_data[%d] which is %d\n", - shmem_dest, - input_offset, - input_data[col][input_offset]); - } // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; @@ -482,13 +417,11 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -500,11 +433,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * rows_in_block; + col_offset += col_sizes[col] * (block.end_row - block.start_row); } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -517,311 +450,35 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = blockDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; - - if (debug_print) { - printf("writing final data from %d to %d at stride %d\n", - thread_start_offset, - shmem_row_size * rows_in_block, - thread_stride); - printf("rows in block %d\n", rows_in_block); - } - for (auto src_offset = thread_start_offset; src_offset < end_offset; - src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - - // three cases to worry about here - // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front - // 2) last 8-byte part of a large row - some bytes of pad at the end - // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front - // AND potentially pad at the rear - - // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. - // 1st case is when we're at some even multiple of shmem_row_size offset. - // 2nd case is when offset + 8 is some even multiple of shmem_row_size. - // must be an 8 byte copy - - // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? 
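// A host-side sketch of the three cases enumerated above, assuming the same
// quantities used below (src_offset steps through shared memory 8 bytes at a
// time). classify_copy and copy_case are illustrative names only.
#include <cstdint>

enum class copy_case { tiny_row, leading_pad, trailing_pad, full_quad };

inline copy_case classify_copy(int32_t src_offset,
                               int32_t shmem_row_size,
                               int32_t real_bytes_in_row,
                               uint8_t dest_shim_offset)
{
  if (real_bytes_in_row + dest_shim_offset <= 8) return copy_case::tiny_row;  // case 3
  if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0)
    return copy_case::leading_pad;  // case 1: first quad of a row
  if ((src_offset + 8) % shmem_row_size == 0 && (real_bytes_in_row + dest_shim_offset) % 8 > 0)
    return copy_case::trailing_pad;  // case 2: last quad of a row
  return copy_case::full_quad;       // plain aligned 8-byte copy
}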
- if (real_bytes_in_row + dest_shim_offset <= 8) { - // case 3, we want to copy real_bytes_in_row bytes - auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("case 3 - %d single byte final write %p(%d) -> %p\n", - num_single_bytes, - &input_ptr[i + dest_shim_offset], - input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first byte with leading pad + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + // the first part and last part of the row is unaligned data copy. This is copied a single byte + // at a time. + if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first part of a row, copy single bytes auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf( - "single byte final write %p -> %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); - } + for (auto i=0; i 0) { - // last bytes of a row - auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("single trailing byte final write %p -> %p\n", - &input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } + } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { + // last part of a row, copy single bytes + auto const num_single_bytes = dest_shim_offset; + for (auto i=0; i(input_ptr); - if (debug_print) { - printf( - "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); - } + const int64_t *long_col_input = reinterpret_cast(input_ptr); *reinterpret_cast(output_ptr) = *long_col_input; } } } -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ -__global__ void copy_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type *offsets, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const int8_t *input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
- - constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i blockDim.x) { - break; - } - auto block = block_infos[this_block_index]; - auto const rows_in_block = block.end_row - block.start_row + 1; - auto const cols_in_block = block.end_col - block.start_col + 1; - extern __shared__ int8_t shared_data[]; - - // copy data from our block's window to shared memory - // offsets information can get us on the row, then we need to know where the column - // starts to offset into the row data. - - // each thread is responsible for 8-byte chunks starting at threadIdx.x and striding - // at blockDim.x. If the 8-byte chunk falls on the boundary of the window, then the - // thread may copy less than 8 bytes. Even if at the beginning of the window, because - // every internal copy is aligned to 8-byte boundaries. - // - // thread 0 thread 1 thread 2 thread 3 thread 4 thread 5 - // 01234567 89abcdef 01234567 89abcdef 01234567 89abcdef - // xxxbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbxxxxxx - // | | | | | | | - // - // - - auto const window_start_quad = col_offsets[block.start_col] / 8; - auto const window_end_quad = (col_offsets[block.end_col] + col_sizes[block.end_col] + 7) / 8; - auto const window_quad_width = window_end_quad - window_start_quad; - auto const total_quads = window_quad_width * rows_in_block; - auto const shared_memory_starting_pad = col_offsets[block.start_col] & 0x7; - - if (debug_print) { - printf("col_offsets[%d]: %d, col_offsets[%d]: %d col_sizes[%d]: %d\n", block.start_col, col_offsets[block.start_col], block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col]); - printf("window start quad is %d, window end quad is %d\n", window_start_quad, window_end_quad); - printf("window quad width is %d and there are %d total quads\n%d shared memory starting pad\n", window_quad_width, total_quads, shared_memory_starting_pad); - } - - // the copy to shared memory will be greedy. We know that the data is 8-byte aligned, so we won't - // access illegal memory by doing 8-byte aligned copies, so we can copy 8-byte aligned. This will - // result in the window edges being duplicated across blocks, but we can copy the padding as well - // to speed up our transfers to shared memory. 
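// A host-side sketch of the quad indexing used by the loop below: each flat index
// names one aligned 8-byte chunk ("quad") of the window and decomposes into a row
// of the window plus a byte offset within that row. decompose_quad is illustrative.
#include <cstdint>

struct quad_pos_sketch {
  int32_t relative_row;   // row within the window
  int32_t offset_in_row;  // byte offset of this quad within the row
};

constexpr quad_pos_sketch decompose_quad(int32_t i, int32_t window_quad_width)
{
  return {i / window_quad_width, (i % window_quad_width) * 8};
}

// with a window three quads wide, flat index 7 is the second quad of the third row
static_assert(decompose_quad(7, 3).relative_row == 2, "row of the window");
static_assert(decompose_quad(7, 3).offset_in_row == 8, "byte offset in that row");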
- for (int i = threadIdx.x; i < total_quads; i += blockDim.x) { - auto const relative_row = i / window_quad_width; - auto const absolute_row = relative_row + block.start_row; - //auto const row = i / window_quad_width; - auto const offset_in_row = i % window_quad_width * 8; - auto const shmem_dest = &shared_data[i * 8]; - - if (debug_print) { - printf("relative_row: %d, absolute_row: %d, offset_in_row: %d, shmem_dest: %p\n", relative_row, absolute_row, offset_in_row, shmem_dest); - printf("offsets is %p\n", offsets); - printf("offsets[%d]: %d\n", absolute_row, offsets[absolute_row]); - printf("input_data[%d] will be dereferenced\n", offsets[absolute_row] + offset_in_row); - } - - // full 8-byte copy - const int64_t *long_col_input = - reinterpret_cast(&input_data[offsets[absolute_row] + offset_in_row]); - if (debug_print) { - printf("which will be address %p\n", long_col_input); - printf("%p <- long %lu\n", shmem_dest, *long_col_input); } - *reinterpret_cast(shmem_dest) = *long_col_input; - } - - __syncthreads(); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; - auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + - shared_memory_row_offset + shared_memory_starting_pad; - auto const column_size = col_sizes[absolute_col]; - - int8_t *shmem_src = &shared_data[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, - shmem_src, dst) ; - } - switch (column_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } - *dst = *shmem_src; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } - *reinterpret_cast(dst) = *short_col_input; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } - *reinterpret_cast(dst) = *int_col_input; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } - *reinterpret_cast(dst) = *long_col_input; - break; - } - default: { - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); - } - // TODO this should just not be supported for fixed width columns, but just 
in case... - for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } - break; - } - } - } - - // now handle validity. Each thread is responsible for 32 rows in 8 columns. - // to prevent indexing issues with a large number of threads, this is compressed - // to a single loop like above. TODO: investigate using shared memory here - auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); - if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); - } - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { - auto const start_col = (index * 8) / validity_batches_per_col; - auto const batch = index % validity_batches_per_col; - auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + (start_col / 8); - - if (debug_print) { - printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); - } - - // one for each column - int32_t dst_validity[8] = {0}; - for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { - int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; - - if (debug_print) { - printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); - } - - auto const val_byte = *validity_ptr; - - for (int i=0; i> src_shift); - } - // auto const dst_bit_mask = 1 << dst_shift; - dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - } - } - - - for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); - if (debug_print) { - printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); - } - *validity_ptr = dst_validity[i]; - } - } -} -} - /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. 
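// A generic illustration of the kind of sizing the helper documented above
// performs, not its actual implementation: pick how many rows of a row group fit
// in the shared-memory budget, then derive the grid from that. Every name here is
// an assumption made for the sketch only.
#include <algorithm>
#include <cstdint>

struct fixed_width_dims_sketch {
  int threads_per_block;
  int blocks;
  int shared_bytes;
};

inline fixed_width_dims_sketch size_fixed_width_kernel(int num_rows,
                                                       int size_per_row,
                                                       int shmem_limit_per_block)
{
  // cap a block at 256 rows or at whatever fits in shared memory, whichever is smaller
  int const rows_per_block = std::max(1, std::min(256, shmem_limit_per_block / size_per_row));
  int const blocks         = (num_rows + rows_per_block - 1) / rows_per_block;
  return {rows_per_block, blocks, rows_per_block * size_per_row};
}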
@@ -895,10 +552,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -929,10 +586,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start.data(), - column_size.data(), - input_data.data(), - input_nm.data(), + column_start->data(), + column_size->data(), + input_data->data(), + input_nm->data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -986,165 +643,21 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -template -static size_type compute_column_information( - iterator begin, - iterator end, - std::vector &column_starts, - std::vector &column_sizes)//, - //std::function nested_type_cb) -{ - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - -// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; -} +} // namespace detail //#define DEBUG - -static std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, - current_window_start_row, - end_col, - 
std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. - int const window_height = std::min( - std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), - row_batches[0].row_count); -#if defined(DEBUG) - printf( - "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], - total_number_of_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); - } - - return block_infos; -} -} // namespace detail + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough + // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. 
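// A minimal standalone sketch (host-side C++, illustrative only) of the "square window"
// heuristic described above: pick a window height of roughly sqrt(shared-memory budget)
// bytes, expressed in rows of the first column, then clamp it to the table's row count and
// the first row batch. The names shmem_limit_per_block, column_sizes and row_batches mirror
// the surrounding code; row_batch_stub and pick_window_height are hypothetical helpers
// added only for illustration, not part of this patch.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

namespace sketch {
using size_type = int32_t;

struct row_batch_stub {
  size_type num_bytes;
  size_type row_count;
};

inline size_type pick_window_height(size_type shmem_limit_per_block,
                                    std::vector<size_type> const &column_sizes,
                                    size_type total_number_of_rows,
                                    std::vector<row_batch_stub> const &row_batches)
{
  // aim for height * width ~= shmem_limit_per_block with height ~= width in bytes,
  // so height is about sqrt(budget) bytes, converted to rows of the first column's size
  auto const bytes_tall =
    static_cast<size_type>(std::sqrt(static_cast<double>(shmem_limit_per_block)));
  auto const rows_tall = std::max<size_type>(1, bytes_tall / std::max<size_type>(1, column_sizes[0]));
  // never taller than the table itself or the first row batch
  return std::min({rows_tall, total_number_of_rows, row_batches[0].row_count});
}
}  // namespace sketch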
+ constexpr int max_window_height = 1024; + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); -#if defined(DEBUG) - void pretty_print(uint64_t i) { + #if defined(DEBUG) + auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -1154,18 +667,8 @@ static std::vector build_block_infos(std::vector const &c } else { printf("%lu Bytes", i); } - } -#endif - -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + }; + #endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -1173,12 +676,6 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#if defined(DEBUG) - size_t free, total; - cudaMemGetInfo(&free, &total); - printf("%lu/%lu Memory\n", free, total); -#endif - // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -1194,78 +691,50 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
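// A minimal sketch of the 8-byte alignment primitive this routine leans on everywhere
// (row starts, shared-memory rows, validity words). align_offset here mirrors
// detail::align_offset defined earlier in this file; align_offset_examples is a hypothetical
// helper added only to show the rounding behaviour, assuming power-of-two alignments.
#include <cassert>
#include <cstdint>

namespace sketch {
constexpr int32_t align_offset(int32_t offset, int32_t alignment)
{
  // rounds offset up to the next multiple of alignment (alignment must be a power of two)
  return (offset + alignment - 1) & ~(alignment - 1);
}

inline void align_offset_examples()
{
  assert(align_offset(0, 8) == 0);    // already on an 8-byte boundary
  assert(align_offset(1, 8) == 8);    // rounds up to the next 8-byte boundary
  assert(align_offset(13, 4) == 16);  // any power-of-two alignment works the same way
}
}  // namespace sketch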
- // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, - iter + num_columns, - column_starts, - column_sizes);//, -// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 
8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; + column_starts.reserve(num_columns+1); // we add a final offset for validity data start + + size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv);} + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + // When building the columns to return, we have to be mindful of the offset limit in cudf. + // It is 32-bit and these data columns are capable of surpassing that easily. The data should + // not be cut off exactly at the limit though due to the validity buffers. The most efficient + // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes + // we keep track of the cut points for the validity, which we call row batches. If the row + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. + // Note that this boundary is for our book-keeping with column pointers and not anything + // that the kernel needs to worry about. We cut the output at convienient boundaries + // when assembling the outgoing data stream. + struct row_batch { + size_type num_bytes; + size_type row_count; + }; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -1277,156 +746,210 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. 
return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - */ +/* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } +*/ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns) * 4; + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate + // the size of each row's variable-width data as well. for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8) + - calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { + row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); + if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_batch_size += row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); } - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - -#if defined(DEBUG) - printf("%d rows and %d columns in table\n", num_rows, num_columns); + #if defined(DEBUG) printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); 
printf("\n"); } -#endif + #endif - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_size = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back( + detail::block_info{start_col, + current_window_start_row, + start_col + end_col, + std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (int col = 0; col < num_columns; ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + + if (row_size_with_this_col * window_height > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + current_window_start_col = col; + } else { + row_size = row_size_with_this_col; + } } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + auto validity_offset = detail::align_offset(column_starts.back(), 4); + column_starts.push_back(validity_offset); + + // build last set of blocks + if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + std::vector input_data; + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } -#if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", - 
block_infos.size(), - block_infos[0].end_col - block_infos[0].start_col + 1, - block_infos[0].end_row - block_infos[0].start_row); + #if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); -#endif + #endif - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i=0; i>>( - num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 blocks; + dim3 threads; + blocks.x = block_infos.size(); + blocks.y = 0; + blocks.z = 0; + threads.x = 1024; + threads.y = 0; + threads.z = 0; + detail::copy_from_columns<<>>(num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { + for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + for (int row=0; row( - data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = + std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = std::make_unique( - data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); + auto data = + std::make_unique(data_type{cudf::type_id::INT8}, + row_batches[i].num_bytes, + std::move(output_data[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } @@ -1445,8 +968,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + auto dev_column_start = 
detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1463,16 +986,16 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); + zero->set_valid(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); + step->set_valid(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1500,100 +1023,6 @@ std::vector> convert_to_rows(cudf::table_view cons } } -std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); - #endif -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); - printf(" shared memory\n"); -#endif - detail::copy_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_data.data(), - dev_output_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - child.data()); - - return std::make_unique(std::move(output_columns)); -} - std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1618,8 +1047,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1634,8 +1063,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1646,10 +1075,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), + 
dev_column_start->data(), + dev_column_size->data(), + dev_output_data->data(), + dev_output_nm->data(), child.data()); return std::make_unique(std::move(output_columns)); @@ -1674,20 +1103,4 @@ std::unique_ptr convert_from_rows( // } } -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows2(lcv, schema, stream, mr); - - return ret; - // } -} - } // namespace cudf From 7bcf41c94a30404b7145b1e32ea9ef77642ae787 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 10 Jun 2021 17:53:09 +0000 Subject: [PATCH 43/80] fixing kernel launch and updating --- .../row_conversion/row_conversion.cpp | 9 +- cpp/src/row_conversion/row_conversion.cu | 105 +++++++++++++----- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index c4edee91b3c..9fa05c408e5 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -28,7 +28,7 @@ class RowConversion : public cudf::benchmark { static void BM_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, +/* auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -38,7 +38,10 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT8, cudf::type_id::UINT64}, 50, - row_count{n_rows}); + row_count{n_rows});*/ + auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows}); cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -98,7 +101,7 @@ static void BM_from_row(benchmark::State& state) (::benchmark::State & st) { BM_to_row(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 16, 1 << 24}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index fb5dc4cb38d..994233a0700 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -347,14 +348,14 @@ struct block_info { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, +__global__ void copy_from_columns(const size_type num_rows, + const size_type num_columns, const int8_t **input_data, - const cudf::bitmask_type **input_nm, - const cudf::size_type *col_sizes, - const cudf::size_type *col_offsets, + const bitmask_type **input_nm, + const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, - const uint64_t *row_offsets, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. @@ -365,47 +366,92 @@ __global__ void copy_from_columns(const cudf::size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
+ bool debug_print = false; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest - - printf("copying from column %d to column %d with rows %d to row %d(grid dim %d, blockIdx %d)\n", block.start_col, block.end_col, block.start_row, block.end_row, gridDim.x, blockIdx.x); - + if (debug_print) { + printf("outputting to offset %lu\n", output_start_offset); + printf("dest shim offset is %d\n", dest_shim_offset); + printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; + if (debug_print) { + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("shmem row size %d\n", shmem_row_size); + printf("validity offset is %d\n", validity_offset); + printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + } for (int col=block.start_col; col<=block.end_col; ++col) { /*if (!col_is_variable) */{ uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; + if (debug_print) { + printf("dest col offset %d\n", dest_col_offset); + } for (int row=block.start_row + threadIdx.x; row(input_data[col]); + if (debug_print) { + printf("%p <- short %d\n", shmem_dest, short_col_input[row]); + } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { const int32_t *int_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { + printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); + } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; - // TODO this should just not be supported for fixed width columns, but just in case... + if (debug_print) { + printf("byte for byte copy due to size %d\n", col_size); + printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -676,6 +722,12 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + #if defined(DEBUG) + size_t free, total; + cudaMemGetInfo( &free, &total ); + printf("%lu/%lu Memory", free, total); + #endif + // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available // we want a single block to fill up the entire shared memory space available @@ -692,7 +744,7 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment std::vector variable_width_columns; // list of the variable width columns in the table @@ -821,7 +873,7 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, current_window_start_row, - start_col + end_col, + end_col, std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); i += window_height; @@ -889,23 +941,20 @@ std::vector> convert_to_rows2(cudf::table_view con auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - std::vector output_data; + std::vector output_buffers; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i=0; i(temp.data())); + output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it - dim3 blocks; - dim3 threads; - blocks.x = block_infos.size(); - blocks.y = 0; - blocks.z = 0; - threads.x = 1024; - threads.y = 0; - threads.z = 0; - detail::copy_from_columns<<>>(num_rows, + dim3 blocks(block_infos.size()); + dim3 threads(1024); + copy_from_columns<<>>(num_rows, num_columns, dev_input_data.data(), dev_input_nm.data(), @@ -932,14 +981,14 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_data[i])); + std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), From 17f1e5da99044036a2873e98905a10b7a5725adb Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 16 Jun 2021 19:25:57 +0000 Subject: [PATCH 44/80] Updates and bug fixing --- .../row_conversion/row_conversion.cpp | 76 ++- cpp/src/row_conversion/row_conversion.cu | 498 ++++++++++++------ cpp/tests/row_conversion/row_conversion.cpp | 106 ---- 3 files changed, 378 insertions(+), 302 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 9fa05c408e5..e1228c9df21 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -25,10 +25,43 @@ class RowConversion : public cudf::benchmark { }; -static void BM_to_row(benchmark::State& state) +static void BM_old_to_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; -/* auto const table = 
create_random_table({cudf::type_id::INT8, + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ + + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + total_bytes += cudf::size_of(t); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto rows = cudf::convert_to_rows(table->view()); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} + +static void BM_new_to_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, cudf::type_id::INT32, cudf::type_id::INT16, cudf::type_id::INT64, @@ -37,11 +70,11 @@ static void BM_to_row(benchmark::State& state) cudf::type_id::UINT16, cudf::type_id::UINT8, cudf::type_id::UINT64}, - 50, - row_count{n_rows});*/ - auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows}); + 212, + row_count{n_rows}); + /* auto const table = create_random_table({cudf::type_id::INT32}, + 64, + row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,14 +85,13 @@ static void BM_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); -// auto rows = cudf::convert_to_rows(table->view()); auto new_rows = cudf::convert_to_rows2(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -static void BM_from_row(benchmark::State& state) +/*static void BM_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -73,9 +105,6 @@ static void BM_from_row(benchmark::State& state) cudf::type_id::UINT64}, 256, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 4, - row_count{n_rows});*/ std::vector schema; cudf::size_type total_bytes = 0; @@ -94,18 +123,19 @@ static void BM_from_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_to_row(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +}*/ + +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ @@ -116,4 +146,4 @@ TO_ROW_CONVERSION_BENCHMARK_DEFINE(to_row_conversion) ->UseManualTime() \ 
->Unit(benchmark::kMillisecond); -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 994233a0700..92ba075c316 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -44,7 +44,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } - /** * Copy a simple vector to device memory asynchronously. Be sure to read * the data on the same stream as is used to copy it. @@ -61,10 +60,9 @@ std::unique_ptr> copy_to_dev_async(const std::vector & } template -rmm::device_uvector copy_to_dev_async2( - const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +rmm::device_uvector copy_to_dev_async2(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_uvector ret(input.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( @@ -346,7 +344,7 @@ struct block_info { * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data - * + * */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, @@ -366,92 +364,119 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; - + bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("Column Info:\n"); - for (int i=0; i(&output_data[0][output_start_offset]) & 7; // offset for alignment shim in order to match shared memory with final dest + uint8_t const dest_shim_offset = + reinterpret_cast(&output_data[0][output_start_offset]) & + 7; // offset for alignment shim in order to match shared memory with final dest if (debug_print) { printf("outputting to offset %lu\n", output_start_offset); printf("dest shim offset is %d\n", dest_shim_offset); printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); + printf("my block is %d,%d -> %d,%d - buffer %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row, + block.buffer_num); } // each thread is responsible for every threadcount rows of data. // the data is copies into shared memory in the final layout. 
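// A hedged sketch (plain host C++, not the kernel itself) of how the shared-memory row pitch
// used just below could be derived: take the byte span of this block's columns, add the
// destination alignment shim (the low 3 bits of the output row address), and round up to
// 8 bytes so each shared-memory row starts 8-byte aligned. shmem_row_pitch is a hypothetical
// helper; the parameter names mirror the kernel's col_offsets/col_sizes/dest_shim_offset.
#include <cstdint>

namespace sketch {
constexpr int32_t align_offset(int32_t offset, int32_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

inline int32_t shmem_row_pitch(int32_t const *col_offsets,
                               int32_t const *col_sizes,
                               int start_col,
                               int end_col,
                               void const *dest_row_start)
{
  // bytes this block's columns actually occupy within one row
  int32_t const real_bytes_in_row =
    col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col];
  // shim so byte positions in shared memory line up with the (possibly unaligned) destination
  int32_t const dest_shim_offset =
    static_cast<int32_t>(reinterpret_cast<std::uintptr_t>(dest_row_start) & 7);
  return align_offset(real_bytes_in_row + dest_shim_offset, 8);
}
}  // namespace sketch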
- auto const shmem_row_size = align_offset(col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col] + dest_shim_offset, 8); // 8 byte alignment required for shared memory rows + auto const real_bytes_in_row = + col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; + auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, + 8); // 8 byte alignment required for shared memory rows auto const validity_offset = col_offsets[num_columns]; if (debug_print) { - printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); + printf("col_offsets[%d] = %d, col_sizes[%d] = %d, col_offsets[%d] = %d\n", + block.end_col, + col_offsets[block.end_col], + block.end_col, + col_sizes[block.end_col], + block.start_col, + col_offsets[block.start_col]); printf("shmem row size %d\n", shmem_row_size); printf("validity offset is %d\n", validity_offset); - printf("starting at %d,%d and going to %d, %d\n", block.start_col, block.start_row, block.end_col, block.end_row); + printf("starting at %d,%d and going to %d, %d\n", + block.start_col, + block.start_row, + block.end_col, + block.end_row); } - for (int col=block.start_col; col<=block.end_col; ++col) { - /*if (!col_is_variable) */{ - uint64_t col_offset = 0; + for (int col = block.start_col; col <= block.end_col; ++col) { + /*if (!col_is_variable) */ { + uint64_t col_offset = 0; cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { - printf("dest col offset %d\n", dest_col_offset); - } - for (int row=block.start_row + threadIdx.x; row(input_data[col]); - if (debug_print) { - printf("%p <- short %d\n", shmem_dest, short_col_input[row]); - } + const int16_t *short_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } *reinterpret_cast(shmem_dest) = short_col_input[row]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); + const int32_t *int_col_input = reinterpret_cast(input_data[col]); if (debug_print) { - printf("shmem[%d][%d] - %p <- int %d\n", row, col, shmem_dest, int_col_input[row]); + printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); } *reinterpret_cast(shmem_dest) = int_col_input[row]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); - } + const int64_t *long_col_input = reinterpret_cast(input_data[col]); + if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } *reinterpret_cast(shmem_dest) = long_col_input[row]; break; } default: { cudf::size_type input_offset = col_size * row; if (debug_print) { - printf("byte for byte copy due to size %d\n", col_size); - printf("%p <- input_data[%d] which is %d\n", shmem_dest, input_offset, input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... + printf("byte for byte copy due to size %d of column %d\n", col_size, col); + printf("%p <- input_data[%d] which is %d\n", + shmem_dest, + input_offset, + input_data[col][input_offset]); + } + // TODO this should just not be supported for fixed width columns, but just in case... 
for (cudf::size_type b = 0; b < col_size; b++) { shmem_dest[b] = input_data[col][b + input_offset]; } @@ -463,11 +488,13 @@ __global__ void copy_from_columns(const size_type num_rows, // so we have to rewrite the addresses to make sure that it is 4 byte aligned // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; + int8_t *valid_byte = + &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } // Now copy validity for the column if (input_nm[col]) { if (bit_is_set(input_nm[col], row)) { @@ -479,11 +506,11 @@ __global__ void copy_from_columns(const size_type num_rows, // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end row + } // end row - col_offset += col_sizes[col] * (block.end_row - block.start_row); + col_offset += col_sizes[col] * rows_in_block; } - } // end col + } // end col // wait for the data to be totally copied into shared memory __syncthreads(); @@ -496,30 +523,75 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. // auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * (block.end_row - block.start_row); src_offset += thread_stride) { + auto const thread_stride = gridDim.x * 8; + if (debug_print) { + printf("writing final data from %d to %d at stride %d\n", + thread_start_offset, + shmem_row_size * rows_in_block, + thread_stride); + printf("rows in block %d\n", rows_in_block); + } + for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; - auto const row_offset = row_offsets[block.start_row + output_row_num]; - auto const col_offset = src_offset % shmem_row_size; - int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; - int8_t *input_ptr = &shared_data[src_offset]; - // the first part and last part of the row is unaligned data copy. This is copied a single byte - // at a time. - if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first part of a row, copy single bytes + auto const row_offset = row_offsets[block.start_row + output_row_num]; + auto const col_offset = src_offset % shmem_row_size; + int8_t *output_ptr = &output_data[block.buffer_num][row_offset + col_offset]; + int8_t *input_ptr = &shared_data[src_offset]; + + // three cases to worry about here + // 1) first 8-byte part of a large row - dest_shim_offset bytes of pad at the front + // 2) last 8-byte part of a large row - some bytes of pad at the end + // 3) corner case of <= 8 bytes of data, which means dest_shim_offset bytes of pad at the front + // AND potentially pad at the rear + + // we know the real number of bytes in a row, so we can figure out if we are in case 3 easily. + // 1st case is when we're at some even multiple of shmem_row_size offset. 
+ // 2nd case is when offset + 8 is some even multiple of shmem_row_size. + // must be an 8 byte copy + + // there is a chance we have a 0 dest_shim_offset and an 8 byte thing to copy, optimize? + if (real_bytes_in_row + dest_shim_offset <= 8) { + // case 3, we want to copy real_bytes_in_row bytes + auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("case 3 - %d single byte final write %p -> %p\n", + num_single_bytes, + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } + output_ptr[i] = input_ptr[i + dest_shim_offset]; + } + } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { + // first byte with leading pad auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i=0; i %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } - } else if (dest_shim_offset > 0 && (src_offset + 8) % shmem_row_size == 0) { - // last part of a row, copy single bytes - auto const num_single_bytes = dest_shim_offset; - for (auto i=0; i 0) { + // last bytes of a row + auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; + for (auto i = 0; i < num_single_bytes; ++i) { + if (debug_print) { + printf("single trailing byte final write %p -> %p\n", + &input_ptr[i + dest_shim_offset], + &output_ptr[i]); + } output_ptr[i] = input_ptr[i + dest_shim_offset]; } } else { // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); + const int64_t *long_col_input = reinterpret_cast(input_ptr); + if (debug_print) { + printf( + "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); + } *reinterpret_cast(output_ptr) = *long_col_input; } } @@ -696,13 +768,14 @@ std::vector> convert_to_rows2(cudf::table_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the data, but small enough - // that multiple columns fit in memory so the writes can coalese as well. Potential optimization for window sizes. + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. constexpr int max_window_height = 1024; - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); - #if defined(DEBUG) +#if defined(DEBUG) auto pretty_print = [](uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); @@ -714,7 +787,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu Bytes", i); } }; - #endif +#endif int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -722,11 +795,11 @@ std::vector> convert_to_rows2(cudf::table_view con CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - #if defined(DEBUG) +#if defined(DEBUG) size_t free, total; - cudaMemGetInfo( &free, &total ); - printf("%lu/%lu Memory", free, total); - #endif + cudaMemGetInfo(&free, &total); + printf("%lu/%lu Memory\n", free, total); +#endif // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -743,45 +816,46 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector variable_width_columns; // list of the variable width columns in the table + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table row_sizes.reserve(num_rows); row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); - column_starts.reserve(num_columns+1); // we add a final offset for validity data start + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); + auto cv = tbl.column(col); + auto col_type = cv.type(); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - if (nested_type) { variable_width_columns.push_back(cv);} + if (nested_type) { variable_width_columns.push_back(cv); } // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; } - + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we hit. - // Note that this boundary is for our book-keeping with column pointers and not anything - // that the kernel needs to worry about. We cut the output at convienient boundaries - // when assembling the outgoing data stream. + // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we + // hit. Note that this boundary is for our book-keeping with column pointers and not anything that + // the kernel needs to worry about. We cut the output at convienient boundaries when assembling + // the outgoing data stream. 
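As a rough illustration of the batching rule described above (names are placeholders, the arithmetic is reduced to the cutting logic only, and it assumes any 32 consecutive rows fit within a 32-bit offset), a host-side sketch could look like this; it is not the code added by this patch:

#include <cstdint>
#include <limits>
#include <vector>

struct RowBatch {     // placeholder for the row_batch type used below
  int64_t num_bytes;
  int32_t row_count;
};

// Group rows into batches whose internal byte offsets fit in an int32,
// cutting each batch back to the last 32-row boundary so validity words
// never straddle a batch.
std::vector<RowBatch> make_row_batches(std::vector<int64_t> const& row_sizes)
{
  std::vector<RowBatch> batches;
  int64_t batch_bytes = 0;  // bytes accumulated in the open batch
  int32_t batch_rows  = 0;  // rows accumulated in the open batch
  int64_t carry_bytes = 0;  // bytes of the rows past the last 32-row boundary
  for (auto const size : row_sizes) {
    if (batch_bytes + size > std::numeric_limits<int32_t>::max()) {
      int32_t const kept_rows = batch_rows & ~31;            // last full boundary
      batches.push_back({batch_bytes - carry_bytes, kept_rows});
      batch_rows  = batch_rows & 31;  // the spilled rows open the next batch
      batch_bytes = carry_bytes;      // ...together with their bytes
    }
    batch_bytes += size;
    ++batch_rows;
    carry_bytes = (batch_rows % 32 == 0) ? 0 : carry_bytes + size;
  }
  if (batch_rows > 0) { batches.push_back({batch_bytes, batch_rows}); }
  return batches;
}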
struct row_batch { size_type num_bytes; size_type row_count; @@ -798,71 +872,90 @@ std::vector> convert_to_rows2(cudf::table_view con // will be included in the variable-width data blob at the end of the // row. return 0; -/* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } -*/ + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ }; uint64_t row_batch_size = 0; uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + auto calculate_validity_size = [](int const num_cols) { + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + return (num_cols + 7) / 8; + }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then calculate - // the size of each row's variable-width data as well. + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. for (int row = 0; row < num_rows; ++row) { - row_sizes[row] = fixed_width_size_per_row + calculate_variable_width_row_data_size(row); - if (row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += calculate_validity_size(num_columns); + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; + row_batches.push_back( + row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned row_offsets.push_back(row_offset); - row_batch_size += row_sizes[row]; + row_batch_size = aligned_row_batch_size + row_sizes[row]; row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned total_table_size += row_sizes[row]; row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); 
} - #if defined(DEBUG) +#if defined(DEBUG) + printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); pretty_print(row_batches[i].num_bytes); printf("\n"); } - #endif +#endif std::vector block_infos; // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access - int current_window_size = 0; + int current_window_width = 0; int current_window_start_col = 0; // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows](int const start_col, int const end_col, int const desired_window_height) { + auto build_blocks = [&block_infos, &row_batches, num_rows]( + int const start_col, int const end_col, int const desired_window_height) { int current_window_start_row = 0; int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; while (i < num_rows) { if (rows_left_in_batch == 0) { current_window_row_batch++; @@ -872,9 +965,10 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.emplace_back( detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows), current_window_row_batch}); + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, num_rows - 1), + current_window_row_batch}); i += window_height; current_window_start_row += window_height; @@ -882,7 +976,17 @@ std::vector> convert_to_rows2(cudf::table_view con } }; - int const window_height = std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + int const window_height = + std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); +#if defined(DEBUG) + printf( + "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + max_window_height, + num_rows, + row_batches[0].row_count, + window_height); +#endif int row_size = 0; @@ -891,32 +995,74 @@ std::vector> convert_to_rows2(cudf::table_view con auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_with_this_col = detail::align_offset(row_size, alignment_needed) + col_size; + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; if (row_size_with_this_col * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_this_col * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_this_col, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col - 1, window_height); - row_size = detail::align_offset(column_starts[col] & 7, alignment_needed) + col_size; // alignment required for shared memory window boundary to match alignment of output row + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row current_window_start_col = col; + current_window_width = 0; } else { row_size = row_size_with_this_col; + current_window_width++; } } - auto validity_offset = detail::align_offset(column_starts.back(), 4); +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif + auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); column_starts.push_back(validity_offset); - + // build last set of blocks - if (current_window_size > 0) { build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); + } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while calculating other things + // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while + // calculating other things std::vector input_data; std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); + column_view cv = tbl.column(column_number); auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; if (!nested_type) { input_data.emplace_back(cv.data()); @@ -924,81 +1070,87 @@ std::vector> convert_to_rows2(cudf::table_view con } } - #if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", block_infos.size(), block_infos[0].end_col - block_infos[0].start_col, block_infos[0].end_row - block_infos[0].start_row); +#if defined(DEBUG) + printf("%lu windows for %d columns, %d rows to fit in ", + block_infos.size(), + block_infos[0].end_col - block_infos[0].start_col + 1, + block_infos[0].end_row - block_infos[0].start_row); pretty_print(shmem_limit_per_block); printf(" shared mem("); pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); pretty_print(total_table_size); printf(" total):\n"); - #endif +#endif auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); auto 
dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); std::vector output_buffers; std::vector output_data; output_data.reserve(row_batches.size()); - for (uint i=0; i(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(1024); - copy_from_columns<<>>(num_rows, - num_columns, - dev_input_data.data(), - dev_input_nm.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + copy_from_columns<<>>( + num_rows, + num_columns, + dev_input_data.data(), + dev_input_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns int offset_offset = 0; std::vector> ret; - for (uint i=0; i offset_vals; offset_vals.reserve(row_batches[i].row_count + 1); size_type cur_offset = 0; offset_vals.push_back(cur_offset); - for (int row=0; row(data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); + auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - auto data = - std::make_unique(data_type{cudf::type_id::INT8}, - row_batches[i].num_bytes, - std::move(output_buffers[i])); + auto data = std::make_unique( + data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, + stream, + mr)); } - + return ret; } diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 818d7a89ddb..c02f83ad1d5 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,13 +21,9 @@ #include #include -#include "cudf/lists/lists_column_view.hpp" -#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; TEST_F(ColumnToRowTests, Single) { @@ -112,105 +108,3 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); 
- - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i=0; i> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i=0; i> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i=0; i Date: Mon, 21 Jun 2021 18:17:45 +0000 Subject: [PATCH 45/80] Updating windows to be generated in a square way so we can have more data to write out as 8-byte writes from shared memory. Shuffled some of the copy to GPU code up so it can start the copy sooner and hopefully won't force stalls. Some bug fixes. 
--- .../row_conversion/row_conversion.cpp | 15 ++- cpp/src/row_conversion/row_conversion.cu | 96 +++++++++++-------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index e1228c9df21..d6b195433cf 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -125,7 +125,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); }*/ -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ +#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ @@ -134,8 +134,17 @@ static void BM_new_to_row(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(RowConversion, name) \ diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 92ba075c316..3f221e2f716 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -364,7 +364,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
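For orientation, each CUDA block's slice of work is described by one block_info entry; a minimal sketch of the lookup the kernel performs is shown below. The field names follow the block_info struct used throughout this patch series, but the kernel body here is reduced to the indexing only and is purely illustrative:

#include <cstdio>

struct block_info {  // mirrors the struct used by these kernels
  int start_col;
  int start_row;
  int end_col;
  int end_row;     // inclusive
  int buffer_num;  // which row-batch output buffer this window belongs to
};

// Minimal sketch: each CUDA block looks up its window and derives its extent.
__global__ void describe_window(block_info const* block_infos)
{
  block_info const block  = block_infos[blockIdx.x];
  int const rows_in_block = block.end_row - block.start_row + 1;
  int const cols_in_block = block.end_col - block.start_col + 1;
  if (threadIdx.x == 0) {
    printf("block %d: %d rows x %d cols -> buffer %d\n",
           (int)blockIdx.x, rows_in_block, cols_in_block, block.buffer_num);
  }
}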
- bool debug_print = false; // blockIdx.x == 70 && threadIdx.x == 448; + constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,6 +383,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } + //else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -416,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, col_sizes[block.end_col], block.start_col, col_offsets[block.start_col]); - printf("shmem row size %d\n", shmem_row_size); + printf("shmem row size %d with real bytes %d\n", shmem_row_size, real_bytes_in_row); printf("validity offset is %d\n", validity_offset); printf("starting at %d,%d and going to %d, %d\n", block.start_col, @@ -524,6 +525,8 @@ __global__ void copy_from_columns(const size_type num_rows, // auto const thread_start_offset = threadIdx.x * 8; auto const thread_stride = gridDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; + if (debug_print) { printf("writing final data from %d to %d at stride %d\n", thread_start_offset, @@ -531,7 +534,7 @@ __global__ void copy_from_columns(const size_type num_rows, thread_stride); printf("rows in block %d\n", rows_in_block); } - for (auto src_offset = thread_start_offset; src_offset < shmem_row_size * rows_in_block; + for (auto src_offset = thread_start_offset; src_offset < end_offset; src_offset += thread_stride) { auto const output_row_num = src_offset / shmem_row_size; auto const row_offset = row_offsets[block.start_row + output_row_num]; @@ -771,7 +774,6 @@ std::vector> convert_to_rows2(cudf::table_view con // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. - constexpr int max_window_height = 1024; const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -816,6 +818,25 @@ std::vector> convert_to_rows2(cudf::table_view con // to that point. These are row batches and they are decided first before building the // windows so the windows can be properly cut around them. 
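The hunk below moves the column-pointer gathering and its device copies ahead of the row-size bookkeeping so the host-to-device transfers can start early, as the commit message describes. The general pattern, sketched with plain cudaMemcpyAsync and an rmm::device_uvector rather than the patch's copy_to_dev_async2 helper (whose exact signature is not shown here):

#include <cuda_runtime.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

// Kick off the host-to-device copy of a small metadata vector, then return
// immediately so the host can keep building row sizes and windows while the
// transfer is in flight. True overlap requires a pinned host buffer; with
// pageable memory the runtime may stage the copy synchronously.
template <typename T>
rmm::device_uvector<T> copy_async(std::vector<T> const& host, rmm::cuda_stream_view stream)
{
  rmm::device_uvector<T> device(host.size(), stream);
  cudaMemcpyAsync(device.data(),
                  host.data(),
                  host.size() * sizeof(T),
                  cudaMemcpyHostToDevice,
                  stream.value());
  return device;
}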
+ // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column @@ -847,6 +868,9 @@ std::vector> convert_to_rows2(cudf::table_view con fixed_width_size_per_row += col_size; } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + // When building the columns to return, we have to be mindful of the offset limit in cudf. // It is 32-bit and these data columns are capable of surpassing that easily. The data should // not be cut off exactly at the limit though due to the validity buffers. The most efficient @@ -901,17 +925,18 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. + auto validity_size = calculate_validity_size(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned row_sizes[row] = fixed_width_size_per_row; // validity is byte aligned - row_sizes[row] += calculate_validity_size(num_columns); + row_sizes[row] += validity_size; // variable width data is 8-byte aligned row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if (aligned_row_batch_size + row_sizes[row] > std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); @@ -932,7 +957,9 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); } -#if defined(DEBUG) + auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + + #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -942,6 +969,16 @@ std::vector> convert_to_rows2(cudf::table_view con } #endif + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + std::vector block_infos; // block infos are organized with the windows going "down" the 
columns @@ -976,8 +1013,13 @@ std::vector> convert_to_rows2(cudf::table_view con } }; + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized + // access, but since other blocks will read/write the edges this may not turn out to be overly important. + // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. + // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are + // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. int const window_height = - std::min(std::min(max_window_height, num_rows), row_batches[0].row_count); + std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); #if defined(DEBUG) printf( "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " @@ -998,20 +1040,21 @@ std::vector> convert_to_rows2(cudf::table_view con std::size_t alignment_needed = col_size; // They are the same for fixed width types auto row_size_aligned = detail::align_offset(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_this_col * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { #if defined(DEBUG) printf( "Window size %d too large at column %d, bumping back to build windows of size %d(cols " "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", - row_size_with_this_col * window_height, + row_size_with_end_pad * window_height, col, row_size * window_height, current_window_start_col, col - 1, window_height, - row_size_with_this_col, + row_size_with_end_pad, row_size, row_size_aligned, shmem_limit_per_block); @@ -1055,20 +1098,6 @@ std::vector> convert_to_rows2(cudf::table_view con build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); } - // Get the pointers to the input columnar data ready - possibly moved up to copy to gpu while - // calculating other things - std::vector input_data; - std::vector input_nm; - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1083,26 +1112,11 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - 
output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 threads(std::min((uint64_t)1024, total_table_size / 8)); + dim3 threads(std::min(1024, shmem_limit_per_block / 8)); #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); From 5c0e52ce20e0708025917552646f3aa48d312b1a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 8 Jul 2021 01:52:36 +0000 Subject: [PATCH 46/80] Adding row to column conversion code. Performance falls off a cliff, but starts out reasonably. I haven't looked at this in nsight yet. --- .../row_conversion/row_conversion.cpp | 74 +- cpp/include/cudf/row_conversion.hpp | 12 + cpp/src/row_conversion/row_conversion.cu | 759 +++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 106 +++ 4 files changed, 748 insertions(+), 203 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index d6b195433cf..7c1f52c5cd6 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -91,7 +91,7 @@ static void BM_new_to_row(benchmark::State& state) state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } -/*static void BM_from_row(benchmark::State& state) +static void BM_old_from_row(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const table = create_random_table({cudf::type_id::INT8, @@ -123,36 +123,62 @@ static void BM_new_to_row(benchmark::State& state) } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -}*/ - -#define OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); +} + +static void BM_new_from_row(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, + 256, + row_count{n_rows}); + + std::vector schema; + cudf::size_type total_bytes = 0; + for (int i = 0; i < table->num_columns(); ++i) { + auto t = table->get_column(i).type(); + schema.push_back(t); + total_bytes += cudf::size_of(t); + } + + auto rows = cudf::convert_to_rows(table->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto out = cudf::convert_from_rows2(rows, schema); + } + + state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); +} -#define NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ +#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, 
name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -OLD_TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -NEW_TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) +TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name) \ +#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { BM_from_row(st); } \ + (::benchmark::State & st) { f(st); } \ BENCHMARK_REGISTER_F(RowConversion, name) \ ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 22}}) \ + ->Ranges({{1 << 6, 1 << 20}}) \ ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -//FROM_ROW_CONVERSION_BENCHMARK_DEFINE(from_row_conversion) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) +FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index f5e2225ad19..282ffa4b0cb 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -48,4 +48,16 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows2( + cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 3f221e2f716..c0e78a03576 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -30,6 +30,7 @@ #include #include +#include #include "cudf/types.hpp" #include "rmm/device_buffer.hpp" #include "thrust/iterator/counting_iterator.h" @@ -332,6 +333,20 @@ struct block_info { int buffer_num; }; +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. 
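Because each row batch is capped at what a 32-bit offset can address, the row form comes back as several list columns. A caller-side round trip would then look roughly like the sketch below; it assumes convert_to_rows2 returns one LIST<INT8> column per batch and that convert_from_rows2 rebuilds a table from a batch plus the column schema, which is how the tests and benchmarks in this series use them. round_trip is an illustrative wrapper, not part of the patch:

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/row_conversion.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <memory>
#include <vector>

// Convert a fixed-width table to the row format and back, one batch at a time.
std::vector<std::unique_ptr<cudf::table>> round_trip(cudf::table_view const& input)
{
  std::vector<cudf::data_type> schema;  // one data_type per column, in order
  for (auto const& col : input) { schema.push_back(col.type()); }

  // each returned column is a LIST<INT8> holding one batch of packed rows
  auto row_batches = cudf::convert_to_rows2(input);

  std::vector<std::unique_ptr<cudf::table>> tables;
  for (auto const& batch : row_batches) {
    tables.push_back(cudf::convert_from_rows2(cudf::lists_column_view{batch->view()}, schema));
  }
  return tables;
}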
+struct row_batch { + size_type num_bytes; + size_type row_count; +}; + /** * @brief copy data from cudf columns into x format, which is row-based * @@ -364,7 +379,7 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - constexpr bool debug_print = false; //blockIdx.x == 2649 && threadIdx.x == 479; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -383,7 +398,7 @@ __global__ void copy_from_columns(const size_type num_rows, }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); } - //else { return; } + // else { return; } auto block = block_infos[blockIdx.x]; auto const rows_in_block = block.end_row - block.start_row + 1; extern __shared__ int8_t shared_data[]; @@ -403,7 +418,7 @@ __global__ void copy_from_columns(const size_type num_rows, block.buffer_num); } // each thread is responsible for every threadcount rows of data. - // the data is copies into shared memory in the final layout. + // the data is copied into shared memory in the final layout. auto const real_bytes_in_row = col_offsets[block.end_col] + col_sizes[block.end_col] - col_offsets[block.start_col]; auto const shmem_row_size = align_offset(real_bytes_in_row + dest_shim_offset, @@ -432,7 +447,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const dest_col_offset = col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += gridDim.x) { + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { if (debug_print) { printf("shmem row %d(%d) at offset %d(%d)\n", row - block.start_row, @@ -524,8 +539,8 @@ __global__ void copy_from_columns(const size_type num_rows, // row in shared memory may not be an entire row of the destination. 
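The copy-out below splits each destination range into a few unaligned leading bytes, a run of 8-byte words, and a few unaligned trailing bytes. A simplified, host-side version of that split, without the shared-memory staging or the per-row shim bookkeeping (copy_with_edges is an illustrative name, and memcpy stands in for the kernel's word-sized loads and stores):

#include <cstdint>
#include <cstring>

// Copy `count` bytes to a destination whose first `shim` bytes are unaligned:
// leading bytes one at a time, then 8-byte words, then any unaligned tail.
// The staged rows in the kernel are padded so its word-sized accesses are safe.
inline void copy_with_edges(std::int8_t* dst, std::int8_t const* src, int count, int shim)
{
  int i = 0;
  for (; i < count && i < shim; ++i) { dst[i] = src[i]; }  // leading bytes
  for (; i + 8 <= count; i += 8) {                         // aligned middle
    std::int64_t word;
    std::memcpy(&word, src + i, 8);
    std::memcpy(dst + i, &word, 8);
  }
  for (; i < count; ++i) { dst[i] = src[i]; }              // trailing bytes
}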
// auto const thread_start_offset = threadIdx.x * 8; - auto const thread_stride = gridDim.x * 8; - auto const end_offset = shmem_row_size * rows_in_block; + auto const thread_stride = blockDim.x * 8; + auto const end_offset = shmem_row_size * rows_in_block; if (debug_print) { printf("writing final data from %d to %d at stride %d\n", @@ -559,9 +574,10 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_single_bytes = real_bytes_in_row - dest_shim_offset; for (auto i = 0; i < num_single_bytes; ++i) { if (debug_print) { - printf("case 3 - %d single byte final write %p -> %p\n", + printf("case 3 - %d single byte final write %p(%d) -> %p\n", num_single_bytes, &input_ptr[i + dest_shim_offset], + input_ptr[i + dest_shim_offset], &output_ptr[i]); } output_ptr[i] = input_ptr[i + dest_shim_offset]; @@ -600,6 +616,237 @@ __global__ void copy_from_columns(const size_type num_rows, } } +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type *offsets, + int8_t **output_data, + cudf::bitmask_type **output_nm, + const size_type *col_sizes, + const size_type *col_offsets, + const block_info *block_infos, + const int8_t *input_data) +{ + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + + if (debug_print) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("Column Info:\n"); + for (int i = 0; i < num_columns; ++i) { + printf("col %d is at %p with size %d and offset %d\n", + i, + output_data[i], + col_sizes[i], + col_offsets[i]); + } + printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + /* printf("Row Offsets:\n"); + for (int i=0; i(&input_data[offsets[absolute_row] + offset_in_row]); + if (debug_print) { + printf("which will be address %p\n", long_col_input); + printf("%p <- long %lu\n", shmem_dest, *long_col_input); } + *reinterpret_cast(shmem_dest) = *long_col_input; + } + + __syncthreads(); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
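That flattened loop is a block-stride walk over the 2-D window; a stripped-down sketch of just the index decoding used below:

#include <cstdio>

// Every thread walks the flattened rows x cols window, decoding
// (relative_row, relative_col) from the flat index so all threads have work
// even when the window has fewer rows than the block has threads.
__global__ void visit_window_elements(int rows_in_block, int cols_in_block)
{
  for (int index = threadIdx.x; index < rows_in_block * cols_in_block;
       index += blockDim.x) {
    int const relative_col = index % cols_in_block;
    int const relative_row = index / cols_in_block;
    if (index < 4) {  // show a few of the mappings
      printf("thread %d -> element (row %d, col %d)\n",
             (int)threadIdx.x, relative_row, relative_col);
    }
  }
}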
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; + auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + + shared_memory_row_offset + shared_memory_starting_pad; + auto const column_size = col_sizes[absolute_col]; + + int8_t *shmem_src = &shared_data[shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, + shmem_src, dst) ; + } + switch (column_size) { + case 1: { + if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } + *dst = *shmem_src; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } + *reinterpret_cast(dst) = *short_col_input; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } + *reinterpret_cast(dst) = *int_col_input; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(shmem_src); + if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } + *reinterpret_cast(dst) = *long_col_input; + break; + } + default: { + if (debug_print) { + printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + } + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } + break; + } + } + } + + __syncthreads(); + + // now handle validity. Each thread is responsible for 32 rows in a single column. + // to prevent indexing issues with a large number of threads, this is compressed + // to a single loop like above. TODO: investigate using shared memory here + auto const validity_batches_per_col = (num_rows + 31) / 32; + auto const validity_batches_total = validity_batches_per_col * num_columns; + if (debug_print) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + } + for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { + // what column is this? 
+ auto const col = index / validity_batches_per_col; + auto const batch = index % validity_batches_per_col; + auto const starting_row = batch * 32; + auto const validity_offset = col_offsets[num_columns] + col / 8; + + if (debug_print) { + printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + } + + int32_t dst_validity = 0; + for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { + int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + + if (debug_print) { + printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + } + + auto const val_byte = *validity_ptr; + auto const src_shift = col % 8; + auto const dst_shift = row % 32; + auto const src_bit_mask = 1 << src_shift; + if (debug_print) { + printf("src bit mask is 0x%x\n", src_bit_mask); + printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); + printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); + } +// auto const dst_bit_mask = 1 << dst_shift; + dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); + if (debug_print) { + printf("validity is now 0x%x\n", dst_validity); + } + } + + + int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); + if (debug_print) { + printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); + printf("validity to write is %d\n", dst_validity); + printf("validity write %p <- %d\n", validity_ptr, dst_validity); + } + *validity_ptr = dst_validity; + } +} + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -764,21 +1011,165 @@ static inline int32_t compute_fixed_width_layout(std::vector co return align_offset(at_offset, 8); // 8 bytes (64 bits) } -} // namespace detail +template +static size_type compute_column_information( + iterator begin, + iterator end, + std::vector &column_starts, + std::vector &column_sizes)//, + //std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + +// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 
8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} //#define DEBUG -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + +static std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, + current_window_start_row, + end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. 
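A worked example of that square-window arithmetic (the 48 KiB figure and the 8-byte element size are illustrative values, not queried ones):

#include <cmath>
#include <cstdio>

// With 48 KiB of shared memory per block, each side of the "square" staging
// area is about sqrt(49152) = 221 bytes. For an 8-byte leading column that is
// a window roughly 221 / 8 = 27 rows tall, before clamping against the table's
// row count and the first row batch.
int main()
{
  int const shmem_limit_per_block = 48 * 1024;  // illustrative, not queried
  int const first_column_size     = 8;          // e.g. an INT64 column
  int const side_bytes =
    static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_per_block)));
  int const height = side_bytes / first_column_size;
  std::printf("window ~%d bytes wide, ~%d rows tall\n", side_bytes, height);
  return 0;
}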
+ int const window_height = std::min( + std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), + row_batches[0].row_count); +#if defined(DEBUG) + printf( + "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " + "%d\n", + size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + total_number_of_rows, + row_batches[0].row_count, + window_height); +#endif + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { +#if defined(DEBUG) + printf( + "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "for shared mem size %d\n", + row_size_with_end_pad * window_height, + col, + row_size * window_height, + current_window_start_col, + col - 1, + window_height, + row_size_with_end_pad, + row_size, + row_size_aligned, + shmem_limit_per_block); +#endif + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); +#if defined(DEBUG) + printf( + "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "or %d)\n", + row_size, + col_size, + row_size + col_size, + column_starts[col - 1], + column_sizes[col - 1], + column_starts[col - 1] + column_sizes[col - 1]); +#endif + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + } + + return block_infos; +} +} // namespace detail #if defined(DEBUG) - auto pretty_print = [](uint64_t i) { + void pretty_print(uint64_t i) { if (i > (1 * 1024 * 1024 * 1024)) { printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); } else if (i > (1 * 1024 * 1024)) { @@ -788,9 +1179,19 @@ std::vector> convert_to_rows2(cudf::table_view con } else { printf("%lu Bytes", i); } - }; + } #endif +std::vector> convert_to_rows2(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. 
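Most of the sizes computed below are rounded up with detail::align_offset; assuming it is the usual power-of-two round-up (its definition is not shown in this hunk), it behaves like this small helper, included only to make the padding arithmetic easier to follow:

#include <cstddef>

// Round `offset` up to the next multiple of `alignment` (a power of two).
constexpr std::size_t align_offset(std::size_t offset, std::size_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

static_assert(align_offset(6, 4) == 8, "6 pads up to the next 4-byte boundary");
static_assert(align_offset(8, 8) == 8, "already-aligned offsets are unchanged");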
+ const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -834,8 +1235,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); + auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -848,43 +1249,48 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, + iter + num_columns, + column_starts, + column_sizes);//, +// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ - if (nested_type) { variable_width_columns.push_back(cv); } +#if defined(DEBUG) + printf("validity offset will be %d + %d = %d\n", + column_starts.back(), + column_sizes.back(), + column_starts.back() + column_sizes.back()); +#endif - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - - // When building the columns to return, we have to be mindful of the offset limit in cudf. 
- // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. - struct row_batch { - size_type num_bytes; - size_type row_count; - }; - std::vector row_batches; + std::vector row_batches; auto calculate_variable_width_row_data_size = [](int const row) { // each level of variable-width data will add an offset/length @@ -936,10 +1342,11 @@ std::vector> convert_to_rows2(cudf::table_view con row_sizes[row] = detail::align_offset(row_sizes[row], 8) + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { // a new batch starts at the last 32-row boundary row_batches.push_back( - row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); row_batch_size = 0; row_batch_rows = row_batch_rows & 31; row_offset = 0; @@ -954,12 +1361,12 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); - #if defined(DEBUG) +#if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { @@ -979,125 +1386,8 @@ std::vector> convert_to_rows2(cudf::table_view con } auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, num_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back( - detail::block_info{start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, num_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has 
lots of 8-byte reads and 8-byte writes. The optimal read/write would be memory cache line sized - // access, but since other blocks will read/write the edges this may not turn out to be overly important. - // For now, we will attempt to build a square window as far as byte sizes. x * y = shared_mem_size. - // Which translates to x^2 = shared_mem_size since we want them equal, so height and width are - // sqrt(shared_mem_size). The trick is that it's in bytes, not rows or columns. - int const window_height = - std::min(std::min(size_type(sqrt(shmem_limit_per_block))/column_sizes[0], num_rows), row_batches[0].row_count); -#if defined(DEBUG) - printf( - "max_window_height is %d, num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - max_window_height, - num_rows, - row_batches[0].row_count, - window_height); -#endif - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (int col = 0; col < num_columns; ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - auto validity_offset = detail::align_offset(column_starts.back() + column_sizes.back(), 4); - column_starts.push_back(validity_offset); - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)tbl.num_columns() - 1, window_height); - } - + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); #if defined(DEBUG) printf("%lu windows for %d columns, %d rows to fit in ", @@ -1116,7 +1406,11 @@ std::vector> convert_to_rows2(cudf::table_view con // blast through the entire table and convert it dim3 blocks(block_infos.size()); - dim3 
threads(std::min(1024, shmem_limit_per_block / 8)); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); + #endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); pretty_print(shmem_limit_per_block); @@ -1206,11 +1500,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid(true, stream); + zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid(true, stream); + step->set_valid_async(true, stream); static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); @@ -1238,6 +1532,97 @@ std::vector> convert_to_rows(cudf::table_view cons } } +std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int shmem_limit_per_block; + CUDA_TRY( + cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + + size_type validity_size = (num_columns + 7) / 8; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column( + schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + + dim3 blocks(block_infos.size()); + #if defined(DEBUG) || 1 + dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); + #else + dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); + #endif +#if defined(DEBUG) + printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); + pretty_print(shmem_limit_per_block); + printf(" shared memory\n"); +#endif + detail::copy_to_columns<<>>( + num_rows, + num_columns, + input.offsets().data(), + dev_output_data.data(), + dev_output_nm.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + dev_block_infos.data(), + child.data()); + + return std::make_unique(std::move(output_columns)); +} + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, @@ -1318,4 +1703,20 @@ std::unique_ptr convert_from_rows( // } } +std::unique_ptr convert_from_rows2( + std::vector> const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); + + // for (uint i=0; iview(); + auto ret = convert_from_rows2(lcv, schema, stream, mr); + + return ret; + // } +} + } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index c02f83ad1d5..818d7a89ddb 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -21,9 +21,13 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" +#include "cudf/types.hpp" struct ColumnToRowTests : public cudf::test::BaseFixture { }; +struct RowToColumnTests : public cudf::test::BaseFixture { +}; TEST_F(ColumnToRowTests, Single) { @@ -108,3 +112,105 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } } + +TEST_F(RowToColumnTests, Single) +{ + cudf::test::fixed_width_column_wrapper a({-1}); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i a({-1, 0, 1}); + cudf::table_view 
in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema{cudf::data_type{cudf::type_id::INT32}}; + for (uint i=0; i int32_t { return rand(); }); + cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); + cudf::table_view in(std::vector{a}); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + + for (uint i=0; i> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows(in); + std::vector schema; + schema.reserve(in.num_columns()); + for (auto col = in.begin(); col < in.end(); ++col) { + schema.push_back(col->type()); + } + for (uint i=0; i Date: Thu, 8 Jul 2021 20:45:18 +0000 Subject: [PATCH 47/80] updating to use make_device_uvector_async and bitmask functions per review comments --- cpp/src/row_conversion/row_conversion.cu | 125 +++++++++-------------- 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c0e78a03576..c73e967cf0f 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +using cudf::detail::make_device_uvector_async; namespace cudf { namespace detail { @@ -45,32 +47,6 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync( - ret->data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - -template -rmm::device_uvector copy_to_dev_async2(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - rmm::device_uvector ret(input.size(), stream, mr); - CUDA_TRY(cudaMemcpyAsync( - ret.data(), input.data(), sizeof(T) * input.size(), cudaMemcpyHostToDevice, stream.value())); - return ret; -} - __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, @@ -180,8 +156,8 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } @@ -278,8 +254,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; + cudf::size_type byte_bit_offset = intra_word_index(col_index); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -505,8 +481,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + col / 8]; - cudf::size_type byte_bit_offset = col % 8; + &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; + cudf::size_type byte_bit_offset = intra_word_index(col); uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -648,7 +624,7 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
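
For orientation, a minimal sketch of one way a thread block can walk a block_info rectangle of the table. The struct and function names are illustrative, not types from this patch, and the exact index mapping differs between the two kernels; the idea is that a flat element index is split so consecutive threads handle consecutive rows of the same column, keeping the per-column global accesses coalesced.

// Hypothetical stand-in for the patch's block_info: an inclusive rectangle of the table.
struct tile_bounds {
  int start_col, start_row, end_col, end_row;
};

// Walk every element of the rectangle with the whole thread block.
__device__ void for_each_tile_element(tile_bounds const tile)
{
  int const rows_in_tile = tile.end_row - tile.start_row + 1;
  int const cols_in_tile = tile.end_col - tile.start_col + 1;
  for (int el = threadIdx.x; el < rows_in_tile * cols_in_tile; el += blockDim.x) {
    int const relative_col = el / rows_in_tile;
    int const relative_row = el % rows_in_tile;
    int const absolute_col = tile.start_col + relative_col;
    int const absolute_row = tile.start_row + relative_row;
    // ...copy one element between shared memory and (absolute_row, absolute_col) here...
    (void)absolute_col;
    (void)absolute_row;
  }
}
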
- bool debug_print = false; //blockIdx.x == 1 && threadIdx.x == 0; + bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -806,7 +782,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const col = index / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + col / 8; + auto const validity_offset = col_offsets[num_columns] + word_index(col); if (debug_print) { printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); @@ -821,7 +797,7 @@ __global__ void copy_to_columns(const size_type num_rows, } auto const val_byte = *validity_ptr; - auto const src_shift = col % 8; + auto const src_shift = intra_word_index(col); auto const dst_shift = row % 32; auto const src_bit_mask = 1 << src_shift; if (debug_print) { @@ -920,10 +896,10 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, @@ -954,10 +930,10 @@ static std::unique_ptr fixed_width_convert_to_rows( num_rows, num_columns, size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, @@ -1004,7 +980,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = word_index(schema.size() + 7); // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1235,8 +1211,8 @@ std::vector> convert_to_rows2(cudf::table_view con } } - auto dev_input_data = detail::copy_to_dev_async2(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async2(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); std::vector row_sizes; // size of each row in bytes including any alignment padding std::vector row_offsets; // offset from the start of the data to this row @@ -1287,8 +1263,8 @@ std::vector> convert_to_rows2(cudf::table_view con #endif - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); std::vector row_batches; @@ -1322,16 +1298,9 @@ std::vector> convert_to_rows2(cudf::table_view con size_type row_batch_rows = 0; uint64_t row_offset = 0; - auto calculate_validity_size = [](int const num_cols) { - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - return (num_cols + 7) / 8; - }; - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = calculate_validity_size(num_columns); + auto validity_size = num_bitmask_words(num_columns); for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1364,7 +1333,7 @@ std::vector> convert_to_rows2(cudf::table_view con row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } - auto dev_row_offsets = detail::copy_to_dev_async2(row_offsets, stream, mr); + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); #if defined(DEBUG) printf("%d rows and %d columns in table\n", num_rows, num_columns); @@ -1384,7 +1353,7 @@ std::vector> convert_to_rows2(cudf::table_view con output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); @@ -1402,7 +1371,7 @@ std::vector> convert_to_rows2(cudf::table_view con printf(" total):\n"); #endif - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it dim3 blocks(block_infos.size()); @@ -1443,7 +1412,7 @@ std::vector> convert_to_rows2(cudf::table_view con } offset_offset += row_batches[i].row_count; - auto dev_offsets = detail::copy_to_dev_async2(offset_vals, stream, mr); + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); auto offsets = std::make_unique( data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); @@ -1477,8 +1446,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector column_size; int32_t size_per_row = 
detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -1495,8 +1464,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = detail::copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = detail::copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -1561,7 +1530,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = (num_columns + 7) / 8; + size_type validity_size = num_bitmask_words(num_columns); size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1569,8 +1538,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // this is probably fine CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = detail::copy_to_dev_async2(column_starts, stream, mr); - auto dev_col_sizes = detail::copy_to_dev_async2(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // build the row_batches from the passed in list column std::vector row_batches; @@ -1590,13 +1559,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async2(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async2(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = detail::copy_to_dev_async2(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(block_infos.size()); #if defined(DEBUG) || 1 @@ -1647,8 +1616,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = detail::copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = detail::copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1663,8 +1632,8 @@ 
std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = detail::copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = detail::copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; @@ -1675,10 +1644,10 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in num_rows, num_columns, size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); From 7bb049655fa35a4453cd705b8740e30eb2041533 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 13 Jul 2021 07:18:49 +0000 Subject: [PATCH 48/80] updating conversion code. Found out bit operations are on 32-bit values, so they can't be used since row data has byte-aligned validity. Performance improvements on the row to column side. --- cpp/src/row_conversion/row_conversion.cu | 106 ++++++++++++----------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index c73e967cf0f..0879a1c50a5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -37,6 +37,8 @@ #include "thrust/iterator/counting_iterator.h" #include "thrust/iterator/transform_iterator.h" +#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) + using cudf::detail::make_device_uvector_async; namespace cudf { @@ -156,11 +158,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -254,8 +256,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[word_index(col_index)]; - cudf::size_type byte_bit_offset = intra_word_index(col_index); + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -481,8 +483,8 @@ __global__ void copy_from_columns(const size_type num_rows, // we do this directly in the final location because the entire row may not // fit in shared memory and may require many blocks to process it entirely int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + word_index(col)]; - cudf::size_type 
byte_bit_offset = intra_word_index(col); + &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; + cudf::size_type byte_bit_offset = col % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); @@ -597,6 +599,7 @@ __global__ void copy_from_columns(const size_type num_rows, * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block * @param offsets * @param output_data * @param output_nm @@ -608,6 +611,7 @@ __global__ void copy_from_columns(const size_type num_rows, */ __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *offsets, int8_t **output_data, cudf::bitmask_type **output_nm, @@ -624,18 +628,10 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. - bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - output_data[i], - col_sizes[i], - col_offsets[i]); - } printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i blockDim.x) { + break; + } + auto block = block_infos[this_block_index]; auto const rows_in_block = block.end_row - block.start_row + 1; auto const cols_in_block = block.end_col - block.start_col + 1; extern __shared__ int8_t shared_data[]; @@ -767,61 +769,58 @@ __global__ void copy_to_columns(const size_type num_rows, } } - __syncthreads(); - - // now handle validity. Each thread is responsible for 32 rows in a single column. + // now handle validity. Each thread is responsible for 32 rows in 8 columns. // to prevent indexing issues with a large number of threads, this is compressed // to a single loop like above. TODO: investigate using shared memory here auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = validity_batches_per_col * num_columns; - if (debug_print) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n", validity_batches_per_col, validity_batches_total, num_rows); + auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); + if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { + printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); } - for (int index = threadIdx.x; index < validity_batches_total; index += blockDim.x) { - // what column is this? 
- auto const col = index / validity_batches_per_col; + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < validity_batches_total; index += blockDim.x * gridDim.x) { + auto const start_col = (index * 8) / validity_batches_per_col; auto const batch = index % validity_batches_per_col; auto const starting_row = batch * 32; - auto const validity_offset = col_offsets[num_columns] + word_index(col); + auto const validity_offset = col_offsets[num_columns] + (start_col / 8); if (debug_print) { - printf("col: %d, batch: %d, starting_row: %d, validity_offset: %d\n", col, batch, starting_row, validity_offset); + printf("%d-%d: cols: %d-%d, word index: %d, batch: %d, starting_row: %d, +validity_offset: %d, index: %d, stride: %d\n", threadIdx.x, blockIdx.x, start_col, start_col + 7, (start_col / 8), batch, starting_row, validity_offset, index, blockDim.x * gridDim.x); } - int32_t dst_validity = 0; + // one for each column + int32_t dst_validity[8] = {0}; for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; if (debug_print) { - printf("validity_ptr is %p for row %d\nwhich is input_data[%d]\n", validity_ptr, row, offsets[row] + validity_offset); + printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); } auto const val_byte = *validity_ptr; - auto const src_shift = intra_word_index(col); - auto const dst_shift = row % 32; - auto const src_bit_mask = 1 << src_shift; - if (debug_print) { - printf("src bit mask is 0x%x\n", src_bit_mask); - printf("src shift is 0x%x and dst shift is 0x%x\n", src_shift, dst_shift); - printf("validity bit is 0x%x\n", (val_byte & src_bit_mask) >> src_shift); - } -// auto const dst_bit_mask = 1 << dst_shift; - dst_validity |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); - if (debug_print) { - printf("validity is now 0x%x\n", dst_validity); + + for (int i=0; i> src_shift); + } + // auto const dst_bit_mask = 1 << dst_shift; + dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - int32_t *validity_ptr = reinterpret_cast(output_nm[col] + (starting_row / 32)); - if (debug_print) { - printf("valiidty_ptr is output_nm[%d]: %p + starting_row / 8: %d because starting row is %d, which becomes %p\n", col, output_nm[col], starting_row / 32, starting_row, output_nm[col] + (starting_row / 32)); - printf("validity to write is %d\n", dst_validity); - printf("validity write %p <- %d\n", validity_ptr, dst_validity); + for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); + if (debug_print) { + printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); + } + *validity_ptr = dst_validity[i]; } - *validity_ptr = dst_validity; } } +} /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -980,7 +979,7 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = word_index(schema.size() + 7); + int32_t validity_bytes_needed = (schema.size() + 7) / 8; // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1300,7 +1299,7 @@ std::vector> convert_to_rows2(cudf::table_view con // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. - auto validity_size = num_bitmask_words(num_columns); + auto validity_size = num_bitmask_words(num_columns) * 4; for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1521,6 +1520,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i CUDA_TRY( cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; + std::vector column_starts; std::vector column_sizes; @@ -1530,7 +1531,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i size_type fixed_width_size_per_row = detail::compute_column_information( iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); - size_type validity_size = num_bitmask_words(num_columns); + size_type validity_size = num_bitmask_words(num_columns) * 4; size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); @@ -1567,7 +1568,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(block_infos.size()); + dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); #if defined(DEBUG) || 1 dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); #else @@ -1581,6 +1582,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i detail::copy_to_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_output_nm.data(), From 2b069caf7e7d34077e37d7a1fdb92439472527fc Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 13 Sep 2021 19:46:03 +0000 Subject: [PATCH 49/80] updating for memcpy_async and validation in a different kernel --- .../row_conversion/row_conversion.cpp | 47 +- cpp/include/cudf/row_conversion.hpp | 38 +- cpp/src/row_conversion/row_conversion.cu | 1926 ++++++++++++----- cpp/tests/row_conversion/row_conversion.cpp | 132 +- java/src/main/native/src/row_conversion.cu | 1293 ++++++++++- java/src/main/native/src/row_conversion.hpp | 12 + 6 files changed, 2714 insertions(+), 734 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 7c1f52c5cd6..ad9925e9043 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -20,7 +20,8 @@ #include #include -#include "cudf_test/column_utilities.hpp" +#include +#include class RowConversion : public cudf::benchmark { }; @@ -39,9 +40,6 @@ static void BM_old_to_row(benchmark::State& state) cudf::type_id::UINT64}, 212, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -52,7 +50,7 @@ static void BM_old_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto rows = cudf::convert_to_rows(table->view()); + auto rows = 
cudf::old_convert_to_rows(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -72,9 +70,6 @@ static void BM_new_to_row(benchmark::State& state) cudf::type_id::UINT64}, 212, row_count{n_rows}); - /* auto const table = create_random_table({cudf::type_id::INT32}, - 64, - row_count{n_rows});*/ cudf::size_type total_bytes = 0; for (int i = 0; i < table->num_columns(); ++i) { @@ -85,7 +80,7 @@ static void BM_new_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto new_rows = cudf::convert_to_rows2(table->view()); + auto new_rows = cudf::convert_to_rows(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -114,12 +109,13 @@ static void BM_old_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::convert_to_rows(table->view()); + auto rows = cudf::old_convert_to_rows(table->view()); + cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::convert_from_rows(rows, schema); + auto out = cudf::old_convert_from_rows(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -148,36 +144,37 @@ static void BM_new_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::convert_to_rows(table->view()); + auto rows = cudf::old_convert_to_rows(table->view()); + cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::convert_from_rows2(rows, schema); + auto out = cudf::convert_from_rows(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); } #define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) #define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ + BENCHMARK_DEFINE_F(RowConversion, name) \ + (::benchmark::State & st) { f(st); } \ + BENCHMARK_REGISTER_F(RowConversion, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 6, 1 << 20}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 282ffa4b0cb..8f82d01b06c 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -24,40 +24,28 @@ namespace cudf { -std::vector> convert_to_rows( - cudf::table_view const &tbl, +std::vector> old_convert_to_rows( + cudf::table_view const& tbl, // TODO need 
something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::vector> convert_to_rows2( - cudf::table_view const &tbl, +std::vector> convert_to_rows( + cudf::table_view const& tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows( - cudf::lists_column_view const &input, - std::vector const &schema, +std::unique_ptr old_convert_from_rows( + cudf::lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, + cudf::lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace cudf diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0879a1c50a5..42c40e0542d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -18,26 +18,42 @@ #include #include #include +#include +#include + +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#include +#endif #include #include +#include +#include #include +#include #include #include +#include #include #include #include + #include +#include #include -#include -#include -#include "cudf/types.hpp" -#include "rmm/device_buffer.hpp" -#include "thrust/iterator/counting_iterator.h" -#include "thrust/iterator/transform_iterator.h" +#include +#include -#define NUM_BLOCKS_PER_KERNEL_TO_COLUMNS (2) +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#endif using cudf::detail::make_device_uvector_async; namespace cudf { @@ -52,11 +68,11 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const int8_t *input_data) + const cudf::size_type* 
input_offset_in_row, + const cudf::size_type* num_bytes, + int8_t** output_data, + cudf::bitmask_type** output_nm, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -81,15 +97,15 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, // Because we are copying fixed width only data and we stride the rows // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); + int64_t* long_shared = reinterpret_cast(shared_data); + const int64_t* long_input = reinterpret_cast(input_data); cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); cudf::size_type shared_output_stride = blockDim.x * blockDim.y; @@ -125,26 +141,26 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { cudf::size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; + const int8_t* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t* col_output = output_data[col_index]; switch (col_size) { case 1: { col_output[row_index] = *col_tmp; break; } case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); + int16_t* short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); + int32_t* int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); + int64_t* long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); break; } default: { @@ -157,12 +173,12 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } } - cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::bitmask_type* nm = output_nm[col_index]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; cudf::size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[row_index / 8] = bitmask; } + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } } // end column loop } // end row copy // wait for the row_group to be totally copied before starting on the next row group @@ -174,11 +190,11 @@ __global__ void 
copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, - const cudf::size_type *num_bytes, - const int8_t **input_data, - const cudf::bitmask_type **input_nm, - int8_t *output_data) + const cudf::size_type* output_offset_in_row, + const cudf::size_type* num_bytes, + const int8_t** input_data, + const cudf::bitmask_type** input_nm, + int8_t* output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -205,8 +221,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, // Because we are copying fixed width only data and we stride the rows // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; @@ -223,26 +239,26 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { cudf::size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; + int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t* col_input = input_data[col_index]; switch (col_size) { case 1: { *col_tmp = col_input[row_index]; break; } case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; + const int16_t* short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; + const int32_t* int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; + const int64_t* long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; break; } default: { @@ -256,10 +272,10 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; cudf::size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { @@ -279,8 +295,8 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, // Step 2: Copy the data back out // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - 
int64_t *long_output = reinterpret_cast(output_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t* long_output = reinterpret_cast(output_data); cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); cudf::size_type shared_input_stride = blockDim.x * blockDim.y; @@ -303,12 +319,35 @@ __global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + struct block_info { int start_col; int start_row; int end_col; int end_row; int buffer_num; + + __host__ __device__ size_type get_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const + { + if (debug_print) + printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", + end_col, + &col_offsets[end_col], + end_col, + &col_sizes[end_col], + start_col, + &col_offsets[start_col], + col_offsets[end_col], + col_sizes[end_col], + col_offsets[start_col]); + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } }; // When building the columns to return, we have to be mindful of the offset limit in cudf. @@ -341,13 +380,14 @@ struct row_batch { */ __global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, - const int8_t **input_data, - const bitmask_type **input_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const size_type *row_offsets, - int8_t **output_data) + const size_type shmem_used_per_block, + const size_type num_block_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + const block_info* block_infos, + const size_type* row_offsets, + int8_t** output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -357,239 +397,597 @@ __global__ void copy_from_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
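
The rewritten kernel below overlaps its two passes with cuda::memcpy_async and per-stage cuda::barrier objects (compute capability 7.0 and newer), fetching the next block into one shared-memory stage while the previous stage is written out. A condensed sketch of that double-buffered shape, using a hypothetical flat tile layout in place of the real block_info addressing; launch it with 2 * tile_bytes of dynamic shared memory.

#include <cooperative_groups.h>
#include <cuda/barrier>

#include <cstdint>

constexpr int STAGES = 2;  // mirrors NUM_BLOCKS_PER_KERNEL_LOADED

__global__ void double_buffered_copy(int8_t const* src, int8_t* dst, int tile_bytes, int num_tiles)
{
  extern __shared__ int8_t smem[];
  int8_t* stage[STAGES] = {smem, smem + tile_bytes};

  __shared__ cuda::barrier<cuda::thread_scope_block> stage_barrier[STAGES];
  auto group = cooperative_groups::this_thread_block();
  if (group.thread_rank() == 0) {
    for (int i = 0; i < STAGES; ++i) { init(&stage_barrier[i], group.size()); }
  }
  group.sync();

  for (int fetch = 0, drain = 0; drain < num_tiles; ++drain) {
    // keep up to STAGES async fetches in flight ahead of the tile being drained
    for (; fetch < num_tiles && fetch < drain + STAGES; ++fetch) {
      auto& fetch_barrier = stage_barrier[fetch % STAGES];
      // before reusing a stage, wait until every thread has finished draining it
      if (fetch >= STAGES) { fetch_barrier.arrive_and_wait(); }
      for (int i = threadIdx.x; i < tile_bytes; i += blockDim.x) {
        cuda::memcpy_async(
          &stage[fetch % STAGES][i], &src[fetch * tile_bytes + i], 1, fetch_barrier);
      }
    }

    auto& drain_barrier = stage_barrier[drain % STAGES];
    drain_barrier.arrive_and_wait();  // all async copies for this tile landed in shared memory
    for (int i = threadIdx.x; i < tile_bytes; i += blockDim.x) {
      dst[drain * tile_bytes + i] = stage[drain % STAGES][i];
    }
  }
}
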
- bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + constexpr bool debug_print = false; // blockIdx.x == 0 && threadIdx.x == 1; + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("Column Info:\n"); - for (int i = 0; i < num_columns; ++i) { - printf("col %d is at %p with size %d and offset %d\n", - i, - input_data[i], - col_sizes[i], - col_offsets[i]); - } + printf("col sizes at %p, col offsets at %p, and row offsets at %p\n", + col_sizes, + col_offsets, + row_offsets); printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it more + // appealing to copy element-wise from input data into shared matching the end layout and do + // row-based memcopies out. + + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + if (debug_print) + printf("row %d(%d), col %d(%d), %d fetch rows, element %d\n", + relative_row, + absolute_row, + relative_col, + absolute_col, + num_fetch_rows, + el); + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + if (debug_print) + printf("block %lu to shared chunk %lu. 
%p <- %p - %d bytes\n", + fetch, + fetch % stages_count, + &shared[fetch % stages_count][shared_offset], + input_src, + col_size); + + // copy the main + cuda::memcpy_async( + &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); + } + } + + auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + if (debug_print) + printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset); + + /* auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = + output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + if (debug_print) + printf("processing row %d\noutput data[%d] is address %p\n", + absolute_row, + absolute_row, + output_dest); + auto const shared_offset = block_row_size * relative_row; + if (debug_print) + printf("memcpy %p <- %p - %d bytes which is row %d\n", + output_dest, + &shared[subset % stages_count][shared_offset], + block_row_size, + absolute_row); + cuda::memcpy_async( + output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_from_columns(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_block, + const size_type* row_offsets, + int8_t** output_data, + const size_type validity_offset, + const block_info* block_infos, + const size_type num_block_infos, + const bitmask_type** input_nm) +{ extern __shared__ int8_t shared_data[]; - uint64_t const output_start_offset = col_offsets[block.start_col] + row_offsets[block.start_row]; - uint8_t const dest_shim_offset = - reinterpret_cast(&output_data[0][output_start_offset]) & - 7; // offset for alignment shim in order to match shared memory with final dest - if (debug_print) { - printf("outputting to offset %lu\n", output_start_offset); - printf("dest shim offset is %d\n", dest_shim_offset); - printf("Shared data is %p-%p\n", shared_data, shared_data + (48 * 1024)); - printf("my block is %d,%d -> %d,%d - buffer %d\n", - block.start_col, - block.start_row, - block.end_col, - block.end_row, - block.buffer_num); + int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + constexpr bool print_debug 
= false; //(threadIdx.x==0 || threadIdx.x == 32) && blockIdx.x == 0; + // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + if (print_debug) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("%d %d - block infos are at %p and my index is %d\n", + threadIdx.x, + blockIdx.x, + block_infos, + blockIdx.x); + printf("%d %d - input nm is %p, input_nm[0] is at %p\n", + threadIdx.x, + blockIdx.x, + input_nm, + input_nm[0]); + printf("shared memory is %p to %p\n", shared_data, shared_data + shmem_used_per_block * 2); + printf("block infos at %p and this is index %d\n", + &block_infos, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + 0); + /* printf("Row Offsets:\n"); + for (int i=0; i + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } } - for (int col = block.start_col; col <= block.end_col; ++col) { - /*if (!col_is_variable) */ { - uint64_t col_offset = 0; - cudf::size_type col_size = col_sizes[col]; - auto const dest_col_offset = - col_offsets[col] - col_offsets[block.start_col] + dest_shim_offset; - if (debug_print) { printf("dest col offset %d\n", dest_col_offset); } - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - if (debug_print) { - printf("shmem row %d(%d) at offset %d(%d)\n", - row - block.start_row, - row, - (row - block.start_row) * shmem_row_size, - row * shmem_row_size); - } - int8_t *shmem_dest = - &shared_data[dest_col_offset + shmem_row_size * (row - block.start_row)]; - switch (col_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", shmem_dest, input_data[col][row]); } - *shmem_dest = input_data[col][row]; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- short %d\n", shmem_dest, short_col_input[row]); } - *reinterpret_cast(shmem_dest) = short_col_input[row]; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { - printf("shmem[%d][%d] - %p <- int 0x%x\n", row, col, shmem_dest, int_col_input[row]); - } - *reinterpret_cast(shmem_dest) = int_col_input[row]; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(input_data[col]); - if (debug_print) { printf("%p <- long %lu\n", shmem_dest, long_col_input[row]); } - *reinterpret_cast(shmem_dest) = long_col_input[row]; - break; - } - default: { - cudf::size_type input_offset = col_size * row; - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", col_size, col); - printf("%p <- input_data[%d] which is %d\n", - shmem_dest, - input_offset, - input_data[col][input_offset]); - } - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - shmem_dest[b] = input_data[col][b + input_offset]; + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + if (print_debug) + printf("%d: waiting at barrier %d\n", + threadIdx.x, + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED); + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + if (print_debug) printf("past barrier...\n"); + } + int8_t* this_shared_block = shared_blocks[validity_block % 2]; + if (print_debug) printf("top of loop for validity block %d\n", validity_block); + if (print_debug) + printf("reading validity block info %d at %p\n", + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + &block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]); + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 31) / 32; + auto const num_sections_y = (num_block_rows + 7) / 8; + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + if (print_debug) { + printf("%d %d - block %d has %d cols, %d rows, %d row length, and %d total sections\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + num_block_cols, + num_block_rows, + validity_data_row_length, + total_sections); + } + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + if (print_debug) + printf( + "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d\n", + threadIdx.x, + blockIdx.x, + warp_id, + total_sections, + warps_per_block, + blockDim.x, + detail::warp_size); + // the block is divided into sections. A warp operates on a section at a time. 
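// A minimal sketch of the section bookkeeping described above: a validity block is
// tiled into 32-column by 8-row sections, one warp per section, with each lane of
// the warp owning one column of its section. The flat-index-to-(x, y) convention
// below is one reasonable choice and is only illustrative.
#include <cassert>

namespace sketch {

struct section_geometry {
  int num_sections_x;  // sections across the columns (32 columns each)
  int num_sections_y;  // sections down the rows (8 rows each)
};

constexpr section_geometry make_sections(int num_block_cols, int num_block_rows)
{
  return {(num_block_cols + 31) / 32, (num_block_rows + 7) / 8};
}

inline void section_example()
{
  auto const geo = make_sections(/*cols=*/40, /*rows=*/20);
  assert(geo.num_sections_x == 2 && geo.num_sections_y == 3);

  // flat section index -> 2D section coordinate, then lane 5 of that warp
  int const my_section_idx = 3;
  int const section_x      = my_section_idx % geo.num_sections_x;  // 1
  int const section_y      = my_section_idx / geo.num_sections_x;  // 1
  int const lane_id        = 5;
  int const relative_col   = section_x * 32 + lane_id;  // column 37 of the block
  int const relative_row   = section_y * 8;             // rows 8..15 of the block
  assert(relative_col == 37 && relative_row == 8);
}

}  // namespace sketch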
+ for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx / num_sections_x; + auto const section_y = my_section_idx % num_sections_x; + + if (print_debug) printf("working on section %d of %d...\n", section_x, num_sections_x); + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 8; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + + if (print_debug) printf("pre ballot sync...\n"); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (print_debug) + printf( + "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d absolute)\n", + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + + if (absolute_col < num_columns) { + if (print_debug) + printf( + "thread %d's byte is at %p, participation mask is 0x%x for relative row %d(%d real), " + "relative col %d(%d absolute)\n", + threadIdx.x, + &input_nm[absolute_col][absolute_row / 32], + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + auto my_byte = + input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + + if (print_debug) + printf( + "thread %d's byte is 0x%x, participation mask is 0x%x for relative row %d(%d real), " + "relative col %d(%d absolute)\n", + threadIdx.x, + my_byte & 0xFF, + participation_mask, + relative_row, + absolute_row, + relative_col, + absolute_col); + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // the bytes we actually write. 
+ for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (print_debug) + printf( + "%d %d - byte_mask is 0x%x, masked_byte is 0x%x, shared_data_block[%d][%d] = " + "0x%x\n", + threadIdx.x, + blockIdx.x, + byte_mask, + my_byte & byte_mask, + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, + validity_write_offset, + validity_data); + if (cols_left <= 8) { + // write byte + if (print_debug) + printf("writing single byte to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + if (print_debug) + printf("writing two bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + if (print_debug) + printf("writing three bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + if (print_debug) + printf("writing 4 bytes to shared offset 0x%x which is %p...\n", + validity_write_offset, + &this_shared_block[validity_write_offset]); + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; } - break; } } + } + } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - // we do this directly in the final location because the entire row may not - // fit in shared memory and may require many blocks to process it entirely - int8_t *valid_byte = - &output_data[block.buffer_num][row_offsets[row] + validity_offset + (col / 8)]; - cudf::size_type byte_bit_offset = col % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - if (debug_print) { printf("Outputting validity to %p\n", valid_byte); } - // Now copy validity for the column - if (input_nm[col]) { - if (bit_is_set(input_nm[col], row)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end row + // make sure entire block has finished copy + group.sync(); - col_offset += col_sizes[col] * rows_in_block; + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + if (print_debug) { + printf( + "base output data is %p, row offset is 0x%x, validity offset into row is 0x%x, word " + "index of block start is 0x%x\n", + output_data[block.buffer_num], + row_offsets[row], + validity_offset, + 
word_index(block.start_col)); + printf( + "%d %d - row %d/%d/%d col %d-%d - %p = shared_data_block[%d][%d] which is %p - %d " + "bytes\n - %p <- 0x%x\n", + threadIdx.x, + blockIdx.x, + block.start_row, + row, + block.end_row, + block.start_col, + block.end_col, + output_data[block.buffer_num] + row_offsets[row] + validity_offset + + (word_index(block.start_col)), + validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, + validity_data_row_length * relative_row, + &this_shared_block[validity_data_row_length * relative_row], + util::div_rounding_up_unsafe(num_block_cols, 8), + output_data[block.buffer_num] + row_offsets[row] + validity_offset + + word_index(block.start_col), + this_shared_block[validity_data_row_length * relative_row]); + } + auto const output_ptr = + output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( + output_ptr, + &this_shared_block[validity_data_row_length * relative_row], + num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + + /* auto const padding_ptr = output_ptr + num_bytes; + auto const padding_needed = -reinterpret_cast(padding_ptr) & 7; + if (print_debug) printf( + "absolute_row: %d, row_offset for this row: 0x%x, validity data bytes: %d, end + address: %p, padding bytes %lu\n", row, row_offsets[row], num_bytes, output_ptr + + num_bytes, padding_needed); cuda::memcpy_async(padding_ptr, zero, padding_needed, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + */ + + /* if (print_debug) { + for (int i=0; i %p\n", - num_single_bytes, - &input_ptr[i + dest_shim_offset], - input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if (dest_shim_offset > 0 && src_offset % shmem_row_size == 0) { - // first byte with leading pad - auto const num_single_bytes = 8 - dest_shim_offset; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf( - "single byte final write %p -> %p\n", &input_ptr[i + dest_shim_offset], &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else if ((src_offset + 8) % shmem_row_size == 0 && - (real_bytes_in_row + dest_shim_offset) % 8 > 0) { - // last bytes of a row - auto const num_single_bytes = (real_bytes_in_row + dest_shim_offset) % 8; - for (auto i = 0; i < num_single_bytes; ++i) { - if (debug_print) { - printf("single trailing byte final write %p -> %p\n", - &input_ptr[i + dest_shim_offset], - &output_ptr[i]); - } - output_ptr[i] = input_ptr[i + dest_shim_offset]; - } - } else { - // copy 8 bytes aligned - const int64_t *long_col_input = reinterpret_cast(input_ptr); - if (debug_print) { - printf( - "long final write %p -> %p\n", long_col_input, reinterpret_cast(output_ptr)); +} + +static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, + size_t col_offset_size, + int const num_cols) +{ + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; +} + +/** + * @brief ensure `read_ahead` buffer blocks are fetched + * + * @param fetch_index internal state passed into the function + * @param processing_index index where processing is occuring + * @param read_ahead_count how many blocks to read ahead + * @param max_resident_blocks how many blocks can be loaded at once + * @param total_blocks total number of blocks 
overall + * @param block_infos pointer to the block infos + * @param col_sizes pointer to column size information + * @param col_offsets pointer to the table's column offsets + * @param row_offsets pointer to offsets for each row in the table + * @param input_data pointer to the input data + * @param shared pointer to shared memory + * @param group thread group participating in the fetch + * @param block_barrier barriers used for each block + * @param debug_print + * @return + */ +static __device__ void fetch_blocks_for_row_to_column( + size_t& fetch_index, + size_t const processing_index, + int const read_ahead_count, + int const max_resident_blocks, + int const total_blocks, + block_info const* const block_infos, + size_type const* const col_sizes, + size_type const* const col_offsets, + size_type const* const row_offsets, + int8_t const* const input_data, + int8_t* shared[], + cooperative_groups::thread_block const group, + cuda::barrier* block_barrier, + bool debug_print) +{ + for (; fetch_index < static_cast(total_blocks) && + fetch_index < (processing_index + read_ahead_count); + ++fetch_index) { + if (debug_print) + printf("fetching block %lu of %d\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks); + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); + auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } + + auto shared_row_offset = 0; + // copy the data for column sizes + if (debug_print) + printf("%d: col sizes memcpy_async(group, %p, %p, %d, barrier);\n", + threadIdx.x, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_size_bytes); + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d fetching to %p with barrier %p\n", + threadIdx.x, + blockIdx.x, + shared[fetch_index % max_resident_blocks], + &fetch_barrier); + cuda::memcpy_async(group, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_sizes[fetch_block.start_col], + col_size_bytes, + fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + if (debug_print) + printf("%d: offsets memcpy_async(group, %p, %p, %d, barrier);\n", + threadIdx.x, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_offset_bytes); + cuda::memcpy_async(group, + &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], + col_offset_bytes, + fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0 && fetch_block.start_col == 0 && + fetch_block.start_row <= 51 && fetch_block.end_row >= 51) { + printf("Input 
data for col 0 row 51 is 0x"); + for (int i = 0; i < col_sizes[0]; ++i) { + printf("%x ", input_data[row_offsets[51] + col_offsets[0] + i]); } - *reinterpret_cast(output_ptr) = *long_col_input; + printf("\n"); + printf( + "this is at offset %d-%d and starting column offset is %d and we're reading %d bytes\n", + col_offsets[0], + col_offsets[0] + col_sizes[0], + starting_col_offset, + fetch_block_row_size); + auto shared_offset = (51 - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + printf("destination is %p", &shared[fetch_index % max_resident_blocks][shared_offset]); + } + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; + row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + if (debug_print) + printf("fetching block %lu to shared chunk %lu. %p <- %p\n", + fetch_index, + fetch_index % max_resident_blocks, + &shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset]); + // copy the main + cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, + fetch_barrier); } } } @@ -600,7 +998,7 @@ __global__ void copy_from_columns(const size_type num_rows, * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets + * @param row_offsets * @param output_data * @param output_nm * @param col_sizes array of sizes for each element in a column - one per column @@ -612,13 +1010,13 @@ __global__ void copy_from_columns(const size_type num_rows, __global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *offsets, - int8_t **output_data, - cudf::bitmask_type **output_nm, - const size_type *col_sizes, - const size_type *col_offsets, - const block_info *block_infos, - const int8_t *input_data) + const size_type* row_offsets, + int8_t** output_data, + const size_type* _col_sizes, + const size_type* _col_offsets, + const block_info* block_infos, + const size_type num_block_infos, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -628,7 +1026,14 @@ __global__ void copy_to_columns(const size_type num_rows, // This has been broken up for us in the block_info struct, so we don't have // any calculation to do here, but it is important to note. 
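// A minimal sketch of the staging pattern used in these kernels: threads issue
// cuda::memcpy_async() calls bound to a block-scoped cuda::barrier, then the whole
// block waits on that barrier before touching the staged bytes. Like the kernels
// guarded above, this requires compute capability 7.0+. The kernel name, slice
// size, and the assumption that dynamic shared memory >= tile_bytes are illustrative.
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cstdint>

__global__ void stage_tile_sketch(int8_t const* global_in, int8_t* global_out, int tile_bytes)
{
  extern __shared__ int8_t staged[];
  auto block = cooperative_groups::this_thread_block();

  __shared__ cuda::barrier<cuda::thread_scope_block> ready;
  if (block.thread_rank() == 0) { init(&ready, block.size()); }
  block.sync();

  // each thread asynchronously stages an 8-byte slice of the tile
  for (int i = static_cast<int>(block.thread_rank()) * 8; i < tile_bytes;
       i += static_cast<int>(block.size()) * 8) {
    int const bytes = min(8, tile_bytes - i);
    cuda::memcpy_async(&staged[i], &global_in[i], bytes, ready);
  }

  // the async copies are only guaranteed complete once the barrier flips
  ready.arrive_and_wait();

  // consume the staged data (here: trivially write it back out)
  for (int i = static_cast<int>(block.thread_rank()); i < tile_bytes;
       i += static_cast<int>(block.size())) {
    global_out[i] = staged[i];
  }
}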
- constexpr bool debug_print = false; //blockIdx.x == 0 && threadIdx.x == 0; + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + /*constexpr*/ bool debug_print = false; // threadIdx.x == 0; + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -638,189 +1043,387 @@ __global__ void copy_to_columns(const size_type num_rows, printf("%d: %d\n", i, row_offsets[i]); }*/ printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); + printf("shared memory pointers are %p and %p\n", shared[0], shared[1]); + printf("shared_memory ends at %p\n", &shared_data[shmem_used_per_block * 2]); + printf("group is %d threads\n", group.size()); } -// else { return; } + // else { return; } - for (int block_offset = 0; block_offset < NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; ++block_offset) { - auto this_block_index = blockIdx.x*NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + block_offset; - if (this_block_index > blockDim.x) { - break; + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); } - auto block = block_infos[this_block_index]; - auto const rows_in_block = block.end_row - block.start_row + 1; - auto const cols_in_block = block.end_col - block.start_col + 1; - extern __shared__ int8_t shared_data[]; + } - // copy data from our block's window to shared memory - // offsets information can get us on the row, then we need to know where the column - // starts to offset into the row data. - - // each thread is responsible for 8-byte chunks starting at threadIdx.x and striding - // at blockDim.x. If the 8-byte chunk falls on the boundary of the window, then the - // thread may copy less than 8 bytes. Even if at the beginning of the window, because - // every internal copy is aligned to 8-byte boundaries. - // - // thread 0 thread 1 thread 2 thread 3 thread 4 thread 5 - // 01234567 89abcdef 01234567 89abcdef 01234567 89abcdef - // xxxbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbbbbbbb bbxxxxxx - // | | | | | | | - // - // - - auto const window_start_quad = col_offsets[block.start_col] / 8; - auto const window_end_quad = (col_offsets[block.end_col] + col_sizes[block.end_col] + 7) / 8; - auto const window_quad_width = window_end_quad - window_start_quad; - auto const total_quads = window_quad_width * rows_in_block; - auto const shared_memory_starting_pad = col_offsets[block.start_col] & 0x7; + group.sync(); - if (debug_print) { - printf("col_offsets[%d]: %d, col_offsets[%d]: %d col_sizes[%d]: %d\n", block.start_col, col_offsets[block.start_col], block.end_col, col_offsets[block.end_col], block.end_col, col_sizes[block.end_col]); - printf("window start quad is %d, window end quad is %d\n", window_start_quad, window_end_quad); - printf("window quad width is %d and there are %d total quads\n%d shared memory starting pad\n", window_quad_width, total_quads, shared_memory_starting_pad); - } + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - // the copy to shared memory will be greedy. 
We know that the data is 8-byte aligned, so we won't - // access illegal memory by doing 8-byte aligned copies, so we can copy 8-byte aligned. This will - // result in the window edges being duplicated across blocks, but we can copy the padding as well - // to speed up our transfers to shared memory. - for (int i = threadIdx.x; i < total_quads; i += blockDim.x) { - auto const relative_row = i / window_quad_width; - auto const absolute_row = relative_row + block.start_row; - //auto const row = i / window_quad_width; - auto const offset_in_row = i % window_quad_width * 8; - auto const shmem_dest = &shared_data[i * 8]; - - if (debug_print) { - printf("relative_row: %d, absolute_row: %d, offset_in_row: %d, shmem_dest: %p\n", relative_row, absolute_row, offset_in_row, shmem_dest); - printf("offsets is %p\n", offsets); - printf("offsets[%d]: %d\n", absolute_row, offsets[absolute_row]); - printf("input_data[%d] will be dereferenced\n", offsets[absolute_row] + offset_in_row); - } + auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), + col_offset_size = sizeof(decltype(*_col_offsets))]( + int const num_cols, + int const num_rows) -> std::tuple { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; - // full 8-byte copy - const int64_t *long_col_input = - reinterpret_cast(&input_data[offsets[absolute_row] + offset_in_row]); - if (debug_print) { - printf("which will be address %p\n", long_col_input); - printf("%p <- long %lu\n", shmem_dest, *long_col_input); } - *reinterpret_cast(shmem_dest) = *long_col_input; - } + return {col_size_bytes, col_offset_bytes}; + }; - __syncthreads(); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. 
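// A minimal sketch of the flattened indexing the comment above describes: one
// global index covers every (row, column) cell of the block, so any number of
// threads can stride over it without a nested row/column loop. Values are
// illustrative.
#include <cassert>

namespace sketch {

inline void flat_index_example()
{
  int const cols_in_block = 5;
  int const rows_in_block = 4;  // 20 cells total, covered by index 0..19

  int const index        = 13;
  int const relative_col = index % cols_in_block;  // 3
  int const relative_row = index / cols_in_block;  // 2
  assert(relative_col == 3 && relative_row == 2);
  assert(relative_row * cols_in_block + relative_col == index);
  assert(rows_in_block * cols_in_block == 20);
}

}  // namespace sketch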
- for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = window_quad_width * 8 * relative_row; - auto const shared_memory_offset = col_offsets[absolute_col] - col_offsets[block.start_col] + - shared_memory_row_offset + shared_memory_starting_pad; - auto const column_size = col_sizes[absolute_col]; - - int8_t *shmem_src = &shared_data[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf("relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n", relative_col, relative_row, absolute_col, absolute_row, shared_memory_row_offset, shared_memory_offset, column_size, - shmem_src, dst) ; - } - switch (column_size) { - case 1: { - if (debug_print) { printf("%p <- byte %d\n", dst, *shmem_src); } - *dst = *shmem_src; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- short %d\n", dst, *short_col_input); } - *reinterpret_cast(dst) = *short_col_input; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- int 0x%x\n", dst, *int_col_input); } - *reinterpret_cast(dst) = *int_col_input; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(shmem_src); - if (debug_print) { printf("%p <- long %lu\n", dst, *long_col_input); } - *reinterpret_cast(dst) = *long_col_input; - break; + if (debug_print) + printf("%d blocks remaining -> %d block infos, %d block index\n", + blocks_remaining, + num_block_infos, + blockIdx.x); + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + fetch_blocks_for_row_to_column(fetch, + subset, + stages_count, + stages_count, + blocks_remaining, + block_infos, + _col_sizes, + _col_offsets, + row_offsets, + input_data, + shared, + group, + block_barrier, + debug_print); + + auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + // ensure our data is ready + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf("%d-%d reading block %lu at address %p\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset, + shared[subset % stages_count]); + + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); + // auto shared_row_offsets = shared[subset]; + auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_row_size(_col_offsets, _col_sizes, debug_print); + + 
// now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. + for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + if (debug_print) + printf("copying for row %d(%d absolute) col %d(%d absolute)\n", + relative_row, + absolute_row, + relative_col, + absolute_col); + + auto const shared_memory_row_offset = block_row_size * relative_row; + if (debug_print) + printf("shared_col_offsets is %p and relative col is %d, making me access %p\n", + shared_col_offsets, + relative_col, + &shared_col_offsets[relative_col]); + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + if (debug_print) + printf("shared_col_sizes is %p and relative col is %d, making me access %p\n", + shared_col_sizes, + relative_col, + &shared_col_sizes[relative_col]); + auto const column_size = shared_col_sizes[relative_col]; + + int8_t* shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; + + if (debug_print) { + printf( + "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " + "shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", + relative_col, + relative_row, + absolute_col, + absolute_row, + shared_memory_row_offset, + shared_memory_offset, + column_size, + shmem_src, + dst/*, + *reinterpret_cast(shmem_src)*/); + printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); } - default: { - if (debug_print) { - printf("byte for byte copy due to size %d of column %d\n", column_size, absolute_col); + if (debug_print && absolute_col == 0 && absolute_row == 51) { + printf("col0row51(%d bytes) = %p - 0x", column_size, shmem_src); + for (int i = 0; i < column_size; ++i) { + printf("%x ", shmem_src[i]); } - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < column_size; b++) { dst[b] = shmem_src[b]; } - break; + printf("\n"); } + + cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); } + group.sync(); + if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + printf( + "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_to_columns(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_block, + const size_type* row_offsets, + cudf::bitmask_type** output_nm, + const size_type validity_offset, + const block_info* block_infos, + const size_type num_block_infos, + const int8_t* input_data) +{ + extern __shared__ int8_t shared_data[]; + int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; - // now handle validity. Each thread is responsible for 32 rows in 8 columns. - // to prevent indexing issues with a large number of threads, this is compressed - // to a single loop like above. TODO: investigate using shared memory here - auto const validity_batches_per_col = (num_rows + 31) / 32; - auto const validity_batches_total = std::max(1, validity_batches_per_col * (num_columns / 8)); - if (debug_print && threadIdx.x == 0 && blockIdx.x == 0) { - printf("validity_batched_per_col is %d\nvalidity_batches_total is %d for %d rows\n%d blocks of %d threads\n", validity_batches_per_col, validity_batches_total, num_rows, gridDim.x, blockDim.x); + bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; + // bool print_debug = false; + // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + if (print_debug) { + printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); + printf("%d %d - block infos are at %p and my index is %d\n", + threadIdx.x, + blockIdx.x, + block_infos, + blockIdx.x); + printf( + "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, row " + "offsets are %p, block infos at %p\n", + threadIdx.x, + blockIdx.x, + shared_data, + shared_data + shmem_used_per_block, + input_data, + output_nm, + row_offsets, + block_infos); + /* printf("Row Offsets:\n"); + for (int i=0; i + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); } + } - // one for each column - int32_t dst_validity[8] = {0}; - for (int row = starting_row; row < std::min(num_rows, starting_row + 32); ++row) { - int8_t const * const validity_ptr = &input_data[offsets[row] + validity_offset]; + group.sync(); - if (debug_print) { - printf("%d: validity_ptr is %p for row %d\n", threadIdx.x, validity_ptr, row); - } - - auto const val_byte = 
*validity_ptr; - - for (int i=0; i> src_shift); + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t* this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const total_sections = num_sections_x * num_sections_y; + + if (print_debug) { + printf("%d %d - block %d has %d cols, %d rows, and %d total sections\n", + threadIdx.x, + blockIdx.x, + blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, + num_block_cols, + num_block_rows, + total_sections); + } + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + if (print_debug) + printf( + "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d\n", + threadIdx.x, + blockIdx.x, + warp_id, + total_sections, + warps_per_block, + blockDim.x, + detail::warp_size); + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + if (print_debug) + printf("%d-%d: si: %d nsx: %d nsy: %d sx: %d sy: %d ar: %d nr: %d rc: %d rr: %d\n", + threadIdx.x, + blockIdx.x, + my_section_idx, + num_sections_x, + num_sections_y, + section_x, + section_y, + absolute_row, + num_rows, + relative_col, + relative_row); + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
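// A minimal sketch of the tail handling used by the validity kernels here: a 32-bit
// ballot word is stored as 1, 2, 3, or 4 bytes depending on how many destination
// bits remain, so the store never runs past the end of the buffer. The helper name
// is illustrative, and the low-byte memcpy matches the int8/int16/int32 stores in
// the kernel only on a little-endian target such as the GPU.
#include <cstdint>
#include <cstring>

namespace sketch {

// write the low ceil(bits_left / 8) bytes of `word` (capped at 4) into dst; bits_left >= 1
inline void store_partial_word(uint8_t* dst, uint32_t word, int bits_left)
{
  int const bytes = bits_left >= 32 ? 4 : (bits_left + 7) / 8;
  std::memcpy(dst, &word, bytes);
}

}  // namespace sketch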
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (print_debug) + printf("%d - Writing validity data 0x%x to shared memory location %d\n", + threadIdx.x, + validity_data, + validity_write_offset); + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } } - // auto const dst_bit_mask = 1 << dst_shift; - dst_validity[i] |= (((val_byte & src_bit_mask) >> src_shift) << dst_shift); } } - - for (int i=0; i(output_nm[start_col + i] + (starting_row / 32)); - if (debug_print) { - printf("%d-%d: validity write output_nm[%d][%d] - %p <- %d\n", threadIdx.x, blockIdx.x, start_col + i, starting_row, validity_ptr, dst_validity[i]); - } - *validity_ptr = dst_validity[i]; + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const starting_address = output_nm[col] + word_index(block_start_row); + + if (print_debug) + printf("memcpy_async(%p(offset %d), %p, %d, subset_barrier);\n", + starting_address, + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + words_to_copy * 4); + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); } } + + // if (print_debug) printf("leaving...\n"); + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED + ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED + : blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } } -} + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 /** * Calculate the dimensions of the kernel for fixed width only columns. @@ -834,8 +1437,8 @@ __global__ void copy_to_columns(const size_type num_rows, static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, const cudf::size_type num_rows, const cudf::size_type size_per_row, - dim3 &blocks, - dim3 &threads) + dim3& blocks, + dim3& threads) { // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. 
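// A minimal sketch of the fixed-width kernel sizing described above: the y
// dimension covers columns with roughly 4 columns per thread (capped at 32 in y),
// and the x dimension takes whatever is left of a 1024-thread block. The arithmetic
// mirrors the surrounding code; the function name is illustrative.
#include <cassert>

namespace sketch {

struct block_shape {
  int x;  // threads along rows
  int y;  // threads along columns
};

inline block_shape fixed_width_block_shape(int num_columns)
{
  int y = (num_columns + 3) / 4;  // one thread per 4 columns...
  if (y > 32) { y = 32; }         // ...but never more than 32 in y
  int const x = 1024 / y;         // fill the rest of the 1024-thread budget
  return {x, y};
}

inline void block_shape_example()
{
  assert(fixed_width_block_shape(9).y == 3);     // ceil(9 / 4)
  assert(fixed_width_block_shape(9).x == 341);   // 1024 / 3
  assert(fixed_width_block_shape(500).y == 32);  // capped at 32
  assert(fixed_width_block_shape(500).x == 32);
}

}  // namespace sketch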
@@ -846,7 +1449,7 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; + int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); if (y_block_size > 32) { y_block_size = 32; } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials @@ -895,14 +1498,14 @@ static std::unique_ptr fixed_width_convert_to_rows( const cudf::size_type num_rows, const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, - const cudf::scalar &scalar_size_per_row, + rmm::device_uvector& column_start, + rmm::device_uvector& column_size, + rmm::device_uvector& input_data, + rmm::device_uvector& input_nm, + const cudf::scalar& zero, + const cudf::scalar& scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { int64_t total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow @@ -944,12 +1547,12 @@ static std::unique_ptr fixed_width_convert_to_rows( mr); } -static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } +static cudf::data_type get_data_type(const cudf::column_view& v) { return v.type(); } -static inline bool are_all_fixed_width(std::vector const &schema) +static inline bool are_all_fixed_width(std::vector const& schema) { return std::all_of( - schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + schema.begin(), schema.end(), [](const cudf::data_type& t) { return cudf::is_fixed_width(t); }); } /** @@ -959,9 +1562,9 @@ static inline bool are_all_fixed_width(std::vector const &schem * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) +static inline int32_t compute_fixed_width_layout(std::vector const& schema, + std::vector& column_start, + std::vector& column_size) { // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. @@ -979,27 +1582,29 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned return align_offset(at_offset, 8); // 8 bytes (64 bits) } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + template -static size_type compute_column_information( - iterator begin, - iterator end, - std::vector &column_starts, - std::vector &column_sizes)//, - //std::function nested_type_cb) +static size_type compute_column_information(iterator begin, + iterator end, + std::vector& column_starts, + std::vector& column_sizes) //, +// std::function nested_type_cb) { size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; -// if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } // a list or string column will write a single uint64 // of data here for offset/length @@ -1021,11 +1626,53 @@ static size_type compute_column_information( //#define DEBUG -static std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) +std::vector build_validity_block_infos( + size_type const& num_columns, + size_type const& num_rows, + size_type const& shmem_limit_per_block, + std::vector const& row_batches) +{ + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; +} + +std::vector build_block_infos(std::vector const& column_sizes, + std::vector const& column_starts, + std::vector const& row_batches, + size_type const total_number_of_rows, + size_type const& shmem_limit_per_block) { std::vector block_infos; @@ -1067,19 +1714,37 @@ static std::vector build_block_infos(std::vector const &c // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. 
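// A minimal sketch of the window-sizing heuristic in the comment above: a roughly
// square window (in bytes) has a side of about sqrt(shared memory budget), and the
// height in rows is that side divided by the leading column's width, clamped to the
// rows that exist. The values are illustrative and the real code rounds and clamps
// somewhat differently.
#include <algorithm>
#include <cassert>
#include <cmath>

namespace sketch {

inline int window_height_rows(int shmem_limit_bytes, int first_col_size, int total_rows)
{
  int const square_side = static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_bytes)));
  return std::clamp(square_side / first_col_size, 1, total_rows);
}

inline void window_height_example()
{
  // ~48KB of shared memory and a 4-byte leading column -> roughly 55 rows per window
  assert(window_height_rows(48 * 1024, 4, 1 << 20) == 55);
}

}  // namespace sketch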
- int const window_height = std::min( - std::min(size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], total_number_of_rows), - row_batches[0].row_count); + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = + std::clamp(util::round_up_safe( + optimal_square_len <= (size_type)column_sizes.size() + ? std::min(optimal_square_len / column_sizes[0], total_number_of_rows) + : row_batches[0].row_count / 2, + 32), + 1, + row_batches[0].row_count); #if defined(DEBUG) printf( - "sqrt(shmem_limit_per_block) / column_sizes[0] is %d and num_rows is %d, batch row count is %d - which makes window height " - "%d\n", - size_type(sqrt(shmem_limit_per_block)) / column_sizes[0], + "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " + "and num_rows is %d, batch row count is %d " + "- which makes window height " + "%d - admin size is %lu\n", + optimal_square_len, + (int)column_sizes.size(), + optimal_square_len / column_sizes[0], total_number_of_rows, row_batches[0].row_count, - window_height); + window_height, + column_sizes.size() * sizeof(size_type) * 2); #endif + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. + return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + int row_size = 0; // march each column and build the blocks of appropriate sizes @@ -1092,14 +1757,26 @@ static std::vector build_block_infos(std::vector const &c auto row_size_with_this_col = row_size_aligned + col_size; auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { #if defined(DEBUG) printf( - "Window size %d too large at column %d, bumping back to build windows of size %d(cols " + "row size with end pad is %d and admin data is %d, which adds up to %d and that is too " + "large for shmem block of %d\n", + row_size_with_end_pad, + calc_admin_data_size(col - current_window_start_col), + row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col), + shmem_limit_per_block); + printf( + "Window size %d too large at column %d, admin size is %d, bumping back to build windows of " + "size %d(cols " "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is %d) " "for shared mem size %d\n", row_size_with_end_pad * window_height, col, + calc_admin_data_size(col - current_window_start_col), row_size * window_height, current_window_start_col, col - 1, @@ -1136,31 +1813,35 @@ static std::vector build_block_infos(std::vector const &c // build last set of blocks if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size()-1, window_height); + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); } return block_infos; } -} // namespace detail #if defined(DEBUG) - void pretty_print(uint64_t i) { - if (i > (1 * 1024 * 1024 * 1024)) { - printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); - } else if (i > (1 * 1024 * 1024)) { - printf("%.2f MB", i / float(1 * 1024 * 1024)); - } else if (i > (1 * 1024)) { - printf("%.2f KB", float(i / 1024)); - } else { - printf("%lu Bytes", i); - } +void pretty_print(uint64_t i) +{ + if (i > (1 * 1024 * 1024 * 1024)) { + printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); + } else if (i > (1 * 1024 * 1024)) { + printf("%.2f MB", i / float(1 * 1024 * 1024)); + } else if (i > (1 * 1024)) { + printf("%.2f KB", float(i / 1024)); + } else { + printf("%lu Bytes", i); } +} #endif +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail -std::vector> convert_to_rows2(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::vector> convert_to_rows(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the // data, but small enough that multiple columns fit in memory so the writes can coalese as well. // Potential optimization for window sizes. @@ -1169,9 +1850,13 @@ std::vector> convert_to_rows2(cudf::table_view con int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + +#if defined(DEBUG) || 1 + total_shmem -= 1024; +#endif + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; #if defined(DEBUG) size_t free, total; @@ -1195,8 +1880,8 @@ std::vector> convert_to_rows2(cudf::table_view con // windows so the windows can be properly cut around them. 
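The window-fitting checks above repeatedly pad row sizes to 8 bytes with the align_offset helper that appears elsewhere in this patch. Here is a standalone copy of that helper with a couple of worked values; it is only valid for power-of-two alignments, which is how the patch uses it.

#include <cstdint>
#include <cstddef>

// Same bit trick the patch uses: round offset up to the next multiple of a
// power-of-two alignment.
constexpr int32_t align_offset(int32_t offset, std::size_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

static_assert(align_offset(13, 8) == 16, "13 bytes of row data pad out to 16");
static_assert(align_offset(16, 8) == 16, "already-aligned offsets are unchanged");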
// Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); for (size_type column_number = 0; column_number < num_columns; column_number++) { @@ -1224,16 +1909,16 @@ std::vector> convert_to_rows2(cudf::table_view con column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); + auto iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, - iter + num_columns, - column_starts, - column_sizes);//, -// [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + size_type fixed_width_size_per_row = detail::compute_column_information(iter, + iter + num_columns, + column_starts, + column_sizes); //, + // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); /* size_type fixed_width_size_per_row = 0; for (int col = 0; col < num_columns; ++col) { auto cv = tbl.column(col); @@ -1261,7 +1946,6 @@ std::vector> convert_to_rows2(cudf::table_view con column_starts.back() + column_sizes.back()); #endif - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); @@ -1329,7 +2013,8 @@ std::vector> convert_to_rows2(cudf::table_view con row_batch_rows++; } if (row_batch_size > 0) { - row_batches.push_back(detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); } auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); @@ -1339,17 +2024,17 @@ std::vector> convert_to_rows2(cudf::table_view con printf("%lu batches:\n", row_batches.size()); for (auto i = 0; i < (int)row_batches.size(); ++i) { printf("%d: %d rows, ", i, row_batches[i].row_count); - pretty_print(row_batches[i].num_bytes); + detail::pretty_print(row_batches[i].num_bytes); printf("\n"); } #endif std::vector output_buffers; - std::vector output_data; + std::vector output_data; output_data.reserve(row_batches.size()); for (uint i = 0; i < row_batches.size(); ++i) { rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); + output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); @@ -1362,38 +2047,63 @@ std::vector> convert_to_rows2(cudf::table_view con block_infos.size(), block_infos[0].end_col - block_infos[0].start_col + 1, block_infos[0].end_row - block_infos[0].start_row); - pretty_print(shmem_limit_per_block); + detail::pretty_print(shmem_limit_per_block); printf(" shared mem("); - pretty_print(fixed_width_size_per_row); + detail::pretty_print(fixed_width_size_per_row); printf("/row, %d columns, %d rows, ", num_columns, num_rows); - pretty_print(total_table_size); + detail::pretty_print(total_table_size); printf(" total):\n"); #endif auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); 
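The batching loop above accumulates per-row byte counts into row batches so that no single output buffer outgrows what the 32-bit offset type can address, and one device buffer is then allocated per batch. A simplified host-side sketch of that accumulation follows; the real code also cuts on 32-row boundaries for the validity data, and `make_row_batches` plus the struct name here are illustrative, not part of the patch.

#include <cstdint>
#include <vector>

struct simple_row_batch {
  int64_t num_bytes;
  int32_t row_count;
};

// Accumulate rows into a batch until adding the next row would exceed the
// byte limit, then start a new batch.
std::vector<simple_row_batch> make_row_batches(std::vector<int32_t> const& row_sizes,
                                               int64_t max_batch_bytes)
{
  std::vector<simple_row_batch> batches;
  simple_row_batch current{0, 0};
  for (auto size : row_sizes) {
    if (current.row_count > 0 && current.num_bytes + size > max_batch_bytes) {
      batches.push_back(current);
      current = {0, 0};
    }
    current.num_bytes += size;
    current.row_count++;
  }
  if (current.row_count > 0) { batches.push_back(current); }
  return batches;
}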
// blast through the entire table and convert it - dim3 blocks(block_infos.size()); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)total_table_size)); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)total_table_size)); - #endif + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 threads(256); + #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); + detail::pretty_print(shmem_limit_per_block); printf(" shared memory\n"); #endif - copy_from_columns<<>>( + detail::copy_from_columns<<>>( num_rows, num_columns, + shmem_limit_per_block, + block_infos.size(), dev_input_data.data(), - dev_input_nm.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); +#if defined(DEBUG) + printf("Launching validity kernel with %d blocks, for %lu validity blocks with %d threads, ", + validity_blocks.x, + validity_block_infos.size(), + validity_threads.x); + detail::pretty_print(total_shmem); + printf(" shared memory\n"); +#endif + detail:: + copy_validity_from_columns<<>>( + num_rows, + num_columns, + shmem_limit_per_block, + dev_row_offsets.data(), + dev_output_data.data(), + column_starts.back(), + dev_validity_block_infos.data(), + validity_block_infos.size(), + dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns @@ -1428,11 +2138,15 @@ std::vector> convert_to_rows2(cudf::table_view con } return ret; +#else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::vector> old_convert_to_rows(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const cudf::size_type num_columns = tbl.num_columns(); @@ -1456,8 +2170,8 @@ std::vector> convert_to_rows(cudf::table_view cons cudf::size_type num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { cudf::column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); @@ -1469,11 +2183,11 @@ std::vector> convert_to_rows(cudf::table_view cons using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); + static_cast(zero.get())->set_value(0, stream); auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); 
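The zero and step scalars built here describe an arithmetic sequence of row offsets for the fixed-width path: with a constant size_per_row, row i of the packed output presumably starts at i * size_per_row. A minimal host-side illustration of that layout is below; the scalar/sequence plumbing the actual code uses is not reproduced, and the function name is hypothetical.

#include <cstdint>
#include <vector>

// offsets[i] = i * size_per_row; offsets[num_rows] is the total byte count,
// matching the zero (start) and step (stride) scalars above.
std::vector<int32_t> fixed_width_row_offsets(int32_t num_rows, int32_t size_per_row)
{
  std::vector<int32_t> offsets(num_rows + 1);
  for (int32_t i = 0; i <= num_rows; ++i) {
    offsets[i] = i * size_per_row;
  }
  return offsets;
}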
step->set_valid_async(true, stream); - static_cast(step.get()) + static_cast(step.get()) ->set_value(static_cast(size_per_row), stream); std::vector> ret; @@ -1500,11 +2214,12 @@ std::vector> convert_to_rows(cudf::table_view cons } } -std::unique_ptr convert_from_rows2(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::unique_ptr convert_from_rows(cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); @@ -1516,11 +2231,13 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int shmem_limit_per_block; - CUDA_TRY( - cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - shmem_limit_per_block /= NUM_BLOCKS_PER_KERNEL_TO_COLUMNS; +#if defined(DEBUG) || 1 + total_shmem -= 1024; +#endif + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1529,7 +2246,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i return std::make_tuple(schema[i], nullptr); }); size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes);//, [](void *) {}); + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); size_type validity_size = num_bitmask_words(num_columns) * 4; @@ -1537,8 +2254,7 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // Ideally we would check that the offsets are all the same, etc. 
but for now // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), - "The layout of the data appears to be off"); + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); @@ -1549,8 +2265,8 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (cudf::size_type i = 0; i < num_columns; i++) { auto column = cudf::make_fixed_width_column( schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); @@ -1568,36 +2284,97 @@ std::unique_ptr convert_from_rows2(cudf::lists_column_view const &i auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks((block_infos.size() + (NUM_BLOCKS_PER_KERNEL_TO_COLUMNS - 1)) / NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - #if defined(DEBUG) || 1 - dim3 threads(std::min(std::min(512, shmem_limit_per_block / 8), (int)child.size())); - #else - dim3 threads(std::min(std::min(1024, shmem_limit_per_block / 8), (int)child.size())); - #endif + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); +#else + dim3 threads(std::min(256, (int)child.size())); +#endif #if defined(DEBUG) printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - pretty_print(shmem_limit_per_block); + detail::pretty_print(total_shmem); printf(" shared memory\n"); #endif - detail::copy_to_columns<<>>( + detail::copy_to_columns<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), - dev_output_nm.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, + row, + std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + printf( + "Launching validity kernel with %d blocks, for %lu validity blocks, col stride %d and row " + "stride of %d with %d threads, ", + validity_blocks.x, + 
validity_block_infos.size(), + column_stride, + row_stride, + threads.x); + detail::pretty_print(total_shmem); + printf(" shared memory\n"); +#endif + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_to_columns<<>>( + num_rows, + num_columns, + shmem_limit_per_block, + input.offsets().data(), + dev_output_nm.data(), + column_starts.back(), + dev_validity_block_infos.data(), + validity_block_infos.size(), + child.data()); + return std::make_unique(std::move(output_columns)); +#else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) +std::unique_ptr old_convert_from_rows(cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // verify that the types are what we expect cudf::column_view child = input.child(); @@ -1619,12 +2396,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (cudf::size_type i = 0; i < num_columns; i++) { auto column = cudf::make_fixed_width_column( schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); @@ -1642,6 +2419,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int shared_size = detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + // printf("Launching (%d, %d, %d) blocks, (%d, %d, %d) threads, with %d shared size\n", + // blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shared_size); + // printf("pointers are column_start: %p, column_size: %p, output_data: %p, output_nm: %p\n", + // dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), + // dev_output_nm.data()); detail::copy_to_fixed_width_columns<<>>( num_rows, num_columns, @@ -1658,36 +2440,4 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } } -std::unique_ptr convert_from_rows( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows(lcv, schema, stream, mr); - - return ret; - // } -} - -std::unique_ptr convert_from_rows2( - std::vector> const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) -{ - CUDF_EXPECTS(input.size() == 1, "Too large of an input, need to concat the output tables..."); - - // for (uint i=0; iview(); - auto ret = convert_from_rows2(lcv, schema, stream, mr); - - return ret; - // } -} - } // namespace cudf diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 818d7a89ddb..e38b37e81a6 
100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -34,8 +34,8 @@ TEST_F(ColumnToRowTests, Single) cudf::test::fixed_width_column_wrapper a({-1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -48,8 +48,8 @@ TEST_F(ColumnToRowTests, Simple) cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -64,8 +64,8 @@ TEST_F(ColumnToRowTests, Tall) cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -84,8 +84,8 @@ TEST_F(ColumnToRowTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -104,8 +104,31 @@ TEST_F(ColumnToRowTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); - auto new_rows = cudf::convert_to_rows2(in); + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + + for (int i = 0; i < 256; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + views.push_back(cols.back()); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); i++) { @@ -120,9 +143,9 @@ TEST_F(RowToColumnTests, Single) auto old_rows = cudf::convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i=0; i a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i views; for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); + 
cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); views.push_back(cols.back()); } cudf::table_view in(views); - auto old_rows = cudf::convert_to_rows(in); + auto old_rows = cudf::old_convert_to_rows(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } - for (uint i=0; i int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + cols.push_back(cudf::test::fixed_width_column_wrapper(r, r + 13)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Big) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + for (int i = 0; i < 256; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 68f1ae93dec..1babbc6fd1a 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -14,36 +14,52 @@ * limitations under the License. */ +#include #include +#include #include +#include + +#include +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#include +#endif #include #include +#include +#include +#include +#include #include #include +#include #include #include #include #include +#include #include - -#include "row_conversion.hpp" - +#include +#include + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#endif + +using cudf::detail::make_device_uvector_async; namespace cudf { -namespace java { -/** - * Copy a simple vector to device memory asynchronously. Be sure to read - * the data on the same stream as is used to copy it. 
- */ -template -std::unique_ptr> copy_to_dev_async(const std::vector &input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync(ret->data(), input.data(), sizeof(T) * input.size(), - cudaMemcpyHostToDevice, stream.value())); - return ret; +namespace detail { + +static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); } __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, @@ -53,7 +69,6 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, const int8_t *input_data) { - // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -122,7 +137,6 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, cudf::size_type col_index_stride = blockDim.y; for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); int8_t *col_output = output_data[col_index]; @@ -208,7 +222,6 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a // requirement for launching the kernel cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; @@ -220,7 +233,6 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ cudf::size_type col_index_stride = blockDim.y; for (cudf::size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); const int8_t *col_input = input_data[col_index]; @@ -304,6 +316,630 @@ copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_ } } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; + + __host__ __device__ size_type get_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + } + __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + + __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } +}; + +// When building the columns to return, we have to be mindful of the offset limit in cudf. +// It is 32-bit and these data columns are capable of surpassing that easily. The data should +// not be cut off exactly at the limit though due to the validity buffers. The most efficient +// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes +// we keep track of the cut points for the validity, which we call row batches. If the row +// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we +// hit. 
Note that this boundary is for our book-keeping with column pointers and not anything that +// the kernel needs to worry about. We cut the output at convienient boundaries when assembling +// the outgoing data stream. +struct row_batch { + size_type num_bytes; + size_type row_count; +}; + +/** + * @brief copy data from cudf columns into x format, which is row-based + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param input_data pointer to raw table data + * @param input_nm pointer to validity data + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param row_offsets offset to a specific row in the input data + * @param output_data pointer to output data + * + */ +__global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type num_block_infos, const int8_t **input_data, + const size_type *col_sizes, const size_type *col_offsets, + const block_info *block_infos, const size_type *row_offsets, + int8_t **output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto const blocks_remaining = + std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + + auto const num_fetch_cols = fetch_block.num_cols(); + auto const num_fetch_rows = fetch_block.num_rows(); + auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_block.start_col]; + auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // wait for the last use of the memory to be completed + if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. 
When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it more + // appealing to copy element-wise from input data into shared matching the end layout and do + // row-based memcopies out. + + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + // copy the main + cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, + fetch_barrier); + } + } + + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + /* auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = + output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + auto const shared_offset = block_row_size * relative_row; + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, + subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_from_columns( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single int32 of validity - so we read 128 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 31) / 32; + auto const num_sections_y = (num_block_rows + 7) / 8; + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx / num_sections_x; + auto const section_y = my_section_idx % num_sections_x; + + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 8; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (absolute_col < num_columns) { + auto my_byte = + input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // the bytes we actually write. 
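The comment above leans on __ballot_sync to turn per-lane, column-oriented validity bits into a row-oriented 32-bit word. A minimal device-side sketch of just that step, in isolation from the rest of the kernel: each participating lane contributes one predicate bit, and every lane gets back a word whose bit i is lane i's predicate, i.e. 32 columns' worth of validity for one row in a single register.

// Sketch only: lane i contributes the validity bit of "its" column;
// the returned word has that bit at position i.
__device__ uint32_t gather_row_validity_bits(uint8_t my_column_byte,
                                             int bit_in_byte,
                                             uint32_t participation_mask)
{
  bool const my_bit_set = my_column_byte & (1u << bit_in_byte);
  // bit `lane` of the result is 1 iff that lane is in participation_mask,
  // is active, and passed a non-zero predicate
  return __ballot_sync(participation_mask, my_bit_set);
}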
+ for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + auto const validity_write_offset = + validity_data_row_length * (relative_row + i) + relative_col / 8; + if (threadIdx.x % detail::warp_size == 0) { + if (cols_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (cols_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (cols_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared memory out to the final destination + for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { + auto const relative_row = row - block.start_row; + auto const output_ptr = + output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( + output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + for (int validity_block = 0; + validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } +} + +static __device__ std::tuple +get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; +} + +/** + * @brief ensure `read_ahead` buffer blocks are fetched + * + * @param fetch_index internal state passed into the function + * @param processing_index index where processing is occuring + * @param read_ahead_count how many blocks to read ahead + * @param max_resident_blocks how many blocks can be loaded at once + * @param total_blocks total number of blocks overall + * @param block_infos pointer to the block infos + * @param col_sizes pointer to column size information + * @param col_offsets pointer to the table's column offsets + * @param row_offsets pointer to offsets for each row in the table + * @param input_data pointer to the input data + * @param shared pointer to shared memory + * @param group thread group participating in the fetch + * @param block_barrier barriers used for each block + * @return + */ +static __device__ void +fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_index, + int const read_ahead_count, int const max_resident_blocks, + int const total_blocks, block_info const *const block_infos, + size_type const *const col_sizes, size_type const *const col_offsets, + size_type const *const row_offsets, int8_t const *const input_data, + int8_t *shared[], cooperative_groups::thread_block const group, + cuda::barrier *block_barrier) { + for (; fetch_index < static_cast(total_blocks) && + fetch_index < 
(processing_index + read_ahead_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + + auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], + &col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, + fetch_barrier); + } + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param row_offsets + * @param output_data + * @param output_nm + * @param col_sizes array of sizes for each element in a column - one per column + * @param col_offsets offset into input data row for each column's start + * @param block_infos information about the blocks of work + * @param input_data pointer to input data + * + */ +__global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. 
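The block_info struct referenced above carries inclusive column and row ranges, and its get_row_size pads the byte span of the block's columns out to 8 bytes. A small worked example of that calculation under a made-up column layout (three columns of 1, 4 and 8 bytes whose offsets already carry the per-type alignment):

#include <cassert>
#include <cstdint>

int main()
{
  int32_t const col_offsets[] = {0, 4, 8};
  int32_t const col_sizes[]   = {1, 4, 8};
  int const start_col = 0, end_col = 2;  // inclusive bounds, as in block_info

  // bytes spanned by the block's columns within one row, padded to 8
  int32_t const span     = col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col];
  int32_t const row_size = (span + 7) & ~7;
  assert(row_size == 16);
  return 0;
}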
+ + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[stages_count]; + if (group.thread_rank() == 0) { + for (int i = 0; i < stages_count; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); + + auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), + col_offset_size = sizeof(decltype(*_col_offsets))]( + int const num_cols, + int const num_rows) -> std::tuple { + auto const col_size_bytes = num_cols * col_size_size; + auto const col_offset_bytes = num_cols * col_offset_size; + + return {col_size_bytes, col_offset_bytes}; + }; + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + fetch_blocks_for_row_to_column(fetch, subset, stages_count, stages_count, blocks_remaining, + block_infos, _col_sizes, _col_offsets, row_offsets, input_data, + shared, group, block_barrier); + + auto &subset_barrier = block_barrier[subset % stages_count]; + // ensure our data is ready + subset_barrier.arrive_and_wait(); + + auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); + // auto shared_row_offsets = shared[subset]; + auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_row_size(_col_offsets, _col_sizes); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. 
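The "global index instead of a double for loop" idea above maps a flat per-thread index onto (row, column) within the block and strides by blockDim.x so every thread stays busy even when there are more threads than rows. A stripped-down device-side sketch of that iteration pattern, with the element copy stubbed out:

// Every thread walks the rows*cols element grid of its block with a stride
// of blockDim.x, deriving row/column from the flat index (columns vary fastest).
__device__ void visit_block_elements(int rows_in_block, int cols_in_block)
{
  for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) {
    int const relative_col = index % cols_in_block;
    int const relative_row = index / cols_in_block;
    // ... copy the element at (relative_row, relative_col) here ...
    (void)relative_col;
    (void)relative_row;
  }
}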
+ for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + auto const column_size = shared_col_sizes[relative_col]; + + int8_t *shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); + } + group.sync(); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } +} + +/** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ +__global__ void copy_validity_to_columns( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single byte of validity - so we read 32 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+ // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + if (validity_block != validity_index) { + shared_block_barriers[validity_index].arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + auto const block_start_col = block.start_col; + auto const block_start_row = block.start_row; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = (num_block_cols + 7) / 8; + auto const num_sections_y = (num_block_rows + 31) / 32; + auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + + auto const relative_col = section_x * 8; + auto const relative_row = section_y * 32 + lane_id; + auto const absolute_col = relative_col + block_start_col; + auto const absolute_row = relative_row + block_start_row; + auto const rows_left = num_rows - absolute_row; + + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); + + if (absolute_row < num_rows) { + auto const my_byte = + input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + + // so every thread that is participating in the warp has a byte, but it's row-based + // data and we need it in column-based. So we shiffle the bits around to make + // the bytes we actually write. 
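The store that follows narrows to 1, 2, 3 or 4 bytes depending on how many rows (or, in the column-to-row kernel, columns) remain, so the last partial word of validity never writes past the end of the shared buffer. A host-side sketch of that narrowing with a hypothetical helper name; memcpy of the low bytes expresses the same cases as the kernel's reinterpret_casts, assuming a little-endian target.

#include <cstdint>
#include <cstring>

// bits_left plays the role of rows_left / cols_left in the kernels above.
void store_partial_validity_word(uint8_t* dst, uint32_t validity_data, int bits_left)
{
  int const bytes_to_write = bits_left <= 8 ? 1 : bits_left <= 16 ? 2 : bits_left <= 24 ? 3 : 4;
  std::memcpy(dst, &validity_data, bytes_to_write);  // low bytes only
}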
+ for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; + ++i, byte_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + // lead thread in each warp writes data + if (threadIdx.x % detail::warp_size == 0) { + auto const validity_write_offset = + validity_data_col_length * (relative_col + i) + relative_row / 8; + + if (rows_left <= 8) { + // write byte + this_shared_block[validity_write_offset] = validity_data & 0xFF; + } else if (rows_left <= 16) { + // write int16 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + } else if (rows_left <= 24) { + // write int16 and then int8 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data & 0xFFFF; + shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; + } else { + // write int32 + *reinterpret_cast(&this_shared_block[validity_write_offset]) = + validity_data; + } + } + } + } + } + + // make sure entire block has finished copy + group.sync(); + + // now async memcpy the shared + for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { + auto const relative_col = col - block.start_col; + + cuda::memcpy_async( + output_nm[col] + word_index(block_start_row), + &this_shared_block[validity_data_col_length * relative_col], + util::div_rounding_up_unsafe(num_block_rows, 8), + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + } + } + + // wait for last blocks of data to arrive + auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? + NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : + blocks_remaining; + for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { + shared_block_barriers[validity_block].arrive_and_wait(); + } +} + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + /** * Calculate the dimensions of the kernel for fixed width only columns. * @param [in] num_columns the number of columns being copied. @@ -317,7 +953,6 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, const cudf::size_type num_rows, const cudf::size_type size_per_row, dim3 &blocks, dim3 &threads) { - // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -327,10 +962,9 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; - if (y_block_size > 32) { + int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); + if (y_block_size > 32) y_block_size = 32; - } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -373,15 +1007,15 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, * going from start row and containing the next num_rows. Most of the parameters passed * into this function are common between runs and should be calculated once. 
*/ -static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - std::unique_ptr> &column_start, - std::unique_ptr> &column_size, - std::unique_ptr> &input_data, - std::unique_ptr> &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +static std::unique_ptr +fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { int64_t total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); @@ -397,30 +1031,23 @@ static std::unique_ptr fixed_width_convert_to_rows( dim3 blocks; dim3 threads; int shared_size = - calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); copy_from_fixed_width_columns<<>>( - start_row, num_rows, num_columns, size_per_row, column_start->data(), column_size->data(), - input_data->data(), input_nm->data(), data->mutable_view().data()); + start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), + input_data.data(), input_nm.data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{}, stream, mr); + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } static cudf::data_type get_data_type(const cudf::column_view &v) { return v.type(); } -static bool is_fixed_width(const cudf::data_type &t) { - return cudf::is_fixed_width(t); -} - -static inline int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); -} - static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), cudf::java::is_fixed_width); + return std::all_of(schema.begin(), schema.end(), + [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); } /** @@ -449,30 +1076,443 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add it // in - int32_t validity_bytes_needed = (schema.size() + 7) / 8; + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. 
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned return align_offset(at_offset, 8); // 8 bytes (64 bits) } +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +template +static size_type compute_column_information(iterator begin, iterator end, + std::vector &column_starts, + std::vector &column_sizes) //, +// std::function nested_type_cb) +{ + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; +} + +std::vector +build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_block, + std::vector const &row_batches) { + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; +} + +std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + 
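+    // walk the rows from top to bottom: each pass of the loop below emits one block_info
+    // covering columns [start_col, end_col] and at most desired_window_height rows, clipped
+    // so that no block ever straddles a row-batch boundary (each batch ends up in its own
+    // output buffer).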
while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, current_window_start_row, end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges this + // may not turn out to be overly important. For now, we will attempt to build a square window as + // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we + // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in + // bytes, not rows or columns. + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = + std::clamp(util::round_up_safe( + optimal_square_len <= (size_type)column_sizes.size() ? + std::min(optimal_square_len / column_sizes[0], total_number_of_rows) : + row_batches[0].row_count / 2, + 32), + 1, row_batches[0].row_count); + + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. + return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory window boundary to match + // alignment of output row + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + } + + return block_infos; +} + +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail + std::vector> convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the 
writes can coalese as well. + // Potential optimization for window sizes. + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: kernels fail to launch if we use all the available shared memory. + total_shmem -= 1024; + + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, + column_sizes); //, + // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); + /* size_type fixed_width_size_per_row = 0; + for (int col = 0; col < num_columns; ++col) { + auto cv = tbl.column(col); + auto col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (nested_type) { 
variable_width_columns.push_back(cv); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + }*/ + + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + + std::vector row_batches; + + auto calculate_variable_width_row_data_size = [](int const row) { + // each level of variable-width data will add an offset/length + // uint64 of data. The first of which is inside the fixed-width + // data itself and needs to be aligned based on what is around + // that data. This is handled above with the fixed-width calculations + // for that reason. We may still need to add more of these offset/length + // combinations if the nesting is deeper than one level as these + // will be included in the variable-width data blob at the end of the + // row. + return 0; + /* auto c = variable_width_columns[col]; + while (true) { + auto col_offsets = c.child(0).data(); + auto col_data_size = size_of(c.child(1).type()); + std::size_t alignment_needed = col_data_size; + + row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; + if (c.num_children() == 0) { + break; + } + c = c.child(1); + } + */ + }; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. 
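+  // In other words (a sketch; variable-width data is currently stubbed out to 0 bytes),
+  // each row ends up occupying
+  //
+  //   align_offset(fixed_width_size_per_row + num_bitmask_words(num_columns) * 4, 8)
+  //
+  // bytes, and rows are accumulated into a row_batch until the running aligned size would
+  // overflow size_type (the 2GB column limit), at which point a new batch is started on a
+  // 32-row boundary.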
+ auto validity_size = num_bitmask_words(num_columns) * 4; + for (int row = 0; row < num_rows; ++row) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += validity_size; + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8) + + calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + } + + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + // blast through the entire table and convert it + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 threads(256); + + detail::copy_from_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail::copy_validity_from_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), + column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), + dev_input_nm.data()); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i = 0; i < row_batches.size(); ++i) { + // compute offsets for this row batch + std::vector offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row = 0; row < 
row_batches[i].row_count; ++row) { + cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + offset_vals.push_back(cur_offset); + } + offset_offset += row_batches[i].row_count; + + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); + auto offsets = std::make_unique(data_type{type_id::INT32}, + (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, + std::move(output_buffers[i])); + + ret.push_back( + cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + } + + return ret; +#else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +} + +std::vector> +old_convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { const cudf::size_type num_columns = tbl.num_columns(); std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), cudf::java::get_data_type); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - if (are_all_fixed_width(schema)) { + if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about @@ -489,8 +1529,8 @@ std::vector> convert_to_rows(cudf::table_view cons input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } - auto dev_input_data = copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = copy_to_dev_async(input_nm, stream, mr); + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = cudf::scalar_type_t; auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); @@ -506,7 +1546,7 @@ std::vector> convert_to_rows(cudf::table_view cons for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { cudf::size_type row_count = num_rows - row_start; row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(fixed_width_convert_to_rows( + ret.emplace_back(detail::fixed_width_convert_to_rows( row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, dev_input_data, dev_input_nm, *zero, *step, stream, mr)); } @@ -521,7 +1561,129 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in std::vector const &schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: unable to launch a kernel with all shared used + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + + size_type validity_size = num_bitmask_words(num_columns) * 4; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); +#if defined(DEBUG) + dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); +#else + dim3 threads(std::min(256, (int)child.size())); +#endif + detail::copy_to_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_to_columns<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), + validity_block_infos.size(), child.data()); + + return std::make_unique(std::move(output_columns)); +#else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; +#endif // 
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +} +std::unique_ptr old_convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); @@ -530,19 +1692,19 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in cudf::size_type num_columns = schema.size(); - if (are_all_fixed_width(schema)) { + if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into std::vector> output_columns; @@ -557,17 +1719,17 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } - auto dev_output_data = copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = copy_to_dev_async(output_nm, stream, mr); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; int shared_size = - calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - copy_to_fixed_width_columns<<>>( - num_rows, num_columns, size_per_row, dev_column_start->data(), dev_column_size->data(), - dev_output_data->data(), dev_output_nm->data(), child.data()); + detail::copy_to_fixed_width_columns<<>>( + num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), + dev_output_data.data(), dev_output_nm.data(), child.data()); return std::make_unique(std::move(output_columns)); } else { @@ -575,5 +1737,4 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } } -} // namespace java } // namespace cudf diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 17abde8df19..517202f3892 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -25,12 +25,24 @@ namespace cudf { namespace java { +std::vector> +old_convert_to_rows(cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + std::vector> convert_to_rows(cudf::table_view const &tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr +old_convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + 
rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, From 92f52cd2b97ac03dec5e9752f1d6cd4e08b4323e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 21 Sep 2021 21:39:00 +0000 Subject: [PATCH 50/80] fixing validity alignment bugs --- cpp/src/row_conversion/row_conversion.cu | 144 +++++++++---- cpp/tests/row_conversion/row_conversion.cpp | 226 +++++++++++++++++++- java/src/main/native/src/row_conversion.cu | 22 +- 3 files changed, 333 insertions(+), 59 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 42c40e0542d..0409a65b630 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -493,7 +493,7 @@ __global__ void copy_from_columns(const size_type num_rows, input_src, col_size); - // copy the main + // copy the element to global memory cuda::memcpy_async( &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -568,7 +568,11 @@ __global__ void copy_validity_from_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - constexpr bool print_debug = false; //(threadIdx.x==0 || threadIdx.x == 32) && blockIdx.x == 0; + int8_t* output_check_addr = nullptr; + int8_t* output_block_start = nullptr; + size_type output_block_size = 0; + + bool print_debug = false; //threadIdx.x==0 && blockIdx.x == 0; // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; if (print_debug) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -659,12 +663,14 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, warp size " "%d\n", threadIdx.x, blockIdx.x, warp_id, total_sections, + num_sections_x, + num_sections_y, warps_per_block, blockDim.x, detail::warp_size); @@ -672,10 +678,10 @@ __global__ void copy_validity_from_columns(const size_type num_rows, for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { // convert to rows and cols - auto const section_x = my_section_idx / num_sections_x; - auto const section_y = my_section_idx % num_sections_x; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; - if (print_debug) printf("working on section %d of %d...\n", section_x, num_sections_x); + if (print_debug) printf("working on section %d,%d - %d of %d...\n", section_x, section_y, my_section_idx, total_sections); auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; @@ -722,7 +728,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_col); // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make // the bytes we actually write. 
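        // For example (illustrative lane values): lane 0 holds the validity byte for
        // column c+0, lane 1 for column c+1, ..., lane 31 for column c+31, each byte
        // covering rows r..r+7 of its own column. On iteration i below,
        // __ballot_sync(participation_mask, my_byte & (1 << i)) gathers bit i (row r+i)
        // from all 32 lanes, so the returned word is the row-major validity for columns
        // c..c+31 of row r+i, which the lead lane then writes out as 1 to 4 bytes.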
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); @@ -744,23 +750,23 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (cols_left <= 8) { // write byte if (print_debug) - printf("writing single byte to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); this_shared_block[validity_write_offset] = validity_data & 0xFF; } else if (cols_left <= 16) { // write int16 if (print_debug) - printf("writing two bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; } else if (cols_left <= 24) { // write int16 and then int8 if (print_debug) - printf("writing three bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -768,8 +774,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, } else { // write int32 if (print_debug) - printf("writing 4 bytes to shared offset 0x%x which is %p...\n", - validity_write_offset, + printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", + threadIdx.x, blockIdx.x, validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; @@ -816,6 +822,18 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const output_ptr = output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + +/* if (num_rows >= 5006) { + auto const row5006_col_65 = output_data[block.buffer_num] + row_offsets[5006] + validity_offset + 65 / 8; + if (output_ptr >= row5006_col_65 && output_ptr <= row5006_col_65 + 4) { + printf("%d %d - writing bytes from %p(0x%x)-%p to %p-%p that overlap global %p(0x%x), which is row 5006, col 65!\n", threadIdx.x, blockIdx.x, &this_shared_block[validity_data_row_length * relative_row], this_shared_block[validity_data_row_length * relative_row], &this_shared_block[validity_data_row_length * relative_row + num_bytes], output_ptr, output_ptr + num_bytes, row5006_col_65, *row5006_col_65); + printf("%d %d - block information\n%d,%d -> %d,%d\n%d columns, %d rows\n", threadIdx.x, blockIdx.x, block.start_col, block.start_row, block.end_col, block.end_row, block.num_cols(), block.num_rows()); + output_check_addr = row5006_col_65; + output_block_start = output_ptr; + output_block_size = num_bytes; + } + }*/ + cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], @@ -851,6 +869,17 @@ __global__ void copy_validity_from_columns(const size_type num_rows, ++validity_block) { shared_block_barriers[validity_block].arrive_and_wait(); } + if (output_check_addr != nullptr) { + printf("output check after write to %p - 0x%x\n", output_check_addr, 
*output_check_addr); + for (int i=0; i get_admin_data_sizes(size_t col_size_size, @@ -901,12 +930,12 @@ static __device__ void fetch_blocks_for_row_to_column( for (; fetch_index < static_cast(total_blocks) && fetch_index < (processing_index + read_ahead_count); ++fetch_index) { - if (debug_print) - printf("fetching block %lu of %d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks); auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; + if (debug_print) + printf("fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending offset %p\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks, fetch_block.start_col, fetch_block.end_col, &col_offsets[fetch_block.start_col], &col_offsets[fetch_block.end_col]); auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; @@ -948,7 +977,7 @@ static __device__ void fetch_blocks_for_row_to_column( &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes); - cuda::memcpy_async(group, + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes, @@ -983,7 +1012,7 @@ static __device__ void fetch_blocks_for_row_to_column( fetch_index % max_resident_blocks, &shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset]); - // copy the main + // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, @@ -1029,7 +1058,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; // threadIdx.x == 0; + /*constexpr*/ bool debug_print = false; //threadIdx.x == 0 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1037,12 +1066,14 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); + printf("%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; i build_validity_block_infos( }(), 8); // we fit as much as we can given the column stride - auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! 
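+  // For instance (illustrative numbers): with column_stride = 128, a row of the validity
+  // window needs div_rounding_up_unsafe(128, 8) = 16 bytes, which is already 8-byte
+  // aligned, so a 16KB shmem_limit_per_block allows 16 * 1024 / 16 = 1024 rows per
+  // validity block (further capped at num_rows below).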
+ auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1695,6 +1747,7 @@ std::vector build_block_infos(std::vector const& column_s } int const window_height = std::min(desired_window_height, rows_left_in_batch); +// printf("block %d, %d to %d, %d\n", start_col, current_window_start_row, end_col, std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1)); block_infos.emplace_back(detail::block_info{ start_col, current_window_start_row, @@ -1716,11 +1769,7 @@ std::vector build_block_infos(std::vector const& column_s // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = - std::clamp(util::round_up_safe( - optimal_square_len <= (size_type)column_sizes.size() - ? std::min(optimal_square_len / column_sizes[0], total_number_of_rows) - : row_batches[0].row_count / 2, - 32), + std::clamp(util::round_up_safe(std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], total_number_of_rows), 32), 1, row_batches[0].row_count); #if defined(DEBUG) @@ -1787,7 +1836,7 @@ std::vector build_block_infos(std::vector const& column_s shmem_limit_per_block); #endif // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); #if defined(DEBUG) @@ -1973,6 +2022,16 @@ std::vector> convert_to_rows(cudf::table_view cons } c = c.child(1); } + exclusive_scan([t](int row_index) { + size_type total_row_size = 0; + for (int i=0 i> convert_to_rows(cudf::table_view cons // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -2310,8 +2370,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); auto const column_stride = [&]() { if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); } else { return util::round_down_safe(desired_rows_and_columns, 8); } @@ -2325,6 +2385,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in return util::round_down_safe(desired_rows_and_columns, 32); }*/ }(); + printf("column stride is %d and row stride is %d. 
std::min(%d, util::round_down_safe(%d * 8 / %d, 32))\n", column_stride, row_stride, num_rows, shmem_limit_per_block, column_stride); + printf("each block uses %d bytes of shared memory\n", (column_stride / 8) * detail::align_offset(row_stride, 4)); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { for (int row = 0; row < num_rows; row += row_stride) { diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index e38b37e81a6..26e071eef79 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -33,11 +33,19 @@ TEST_F(ColumnToRowTests, Single) { cudf::test::fixed_width_column_wrapper a({-1}); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -47,11 +55,19 @@ TEST_F(ColumnToRowTests, Simple) { cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -63,11 +79,20 @@ TEST_F(ColumnToRowTests, Tall) cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); + std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; auto old_rows = cudf::old_convert_to_rows(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -77,10 +102,12 @@ TEST_F(ColumnToRowTests, Wide) { std::vector> cols; std::vector views; + std::vector schema; for (int i = 0; i < 256; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); } cudf::table_view in(views); @@ -88,6 +115,13 @@ TEST_F(ColumnToRowTests, Wide) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = 
cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -97,10 +131,13 @@ TEST_F(ColumnToRowTests, SingleByteWide) { std::vector> cols; std::vector views; + std::vector schema; for (int i = 0; i < 256; ++i) { cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); views.push_back(cols.back()); + + schema.push_back(cudf::data_type{cudf::type_id::INT8}); } cudf::table_view in(views); @@ -108,6 +145,59 @@ TEST_F(ColumnToRowTests, SingleByteWide) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Non2Power) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j=0; jnum_columns(); ++j) { + printf("testing column %d\n", j); + if (j==65) { + printf("old\n"); + cudf::test::print(old_tbl->get_column(j)); + printf("new\n"); + cudf::test::print(new_tbl->get_column(j)); + } + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } + for (uint i = 0; i < old_rows.size(); i++) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); } @@ -119,11 +209,69 @@ TEST_F(ColumnToRowTests, Big) cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); std::vector> cols; std::vector views; + std::vector schema; - for (int i = 0; i < 256; ++i) { + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Bigger) +{ + auto r = + 
cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { cols.push_back( - cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + EXPECT_EQ(old_rows.size(), new_rows.size()); + for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + } +} + +TEST_F(ColumnToRowTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 128 columns of 2 million rows + constexpr auto num_rows = 2 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); } cudf::table_view in(views); @@ -238,7 +386,7 @@ TEST_F(RowToColumnTests, SingleByteWide) } } -TEST_F(RowToColumnTests, non2power) +TEST_F(RowToColumnTests, Non2Power) { auto r = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); @@ -246,9 +394,13 @@ TEST_F(RowToColumnTests, non2power) std::vector views; std::vector schema; - cols.push_back(cudf::test::fixed_width_column_wrapper(r, r + 13)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); + constexpr auto num_rows = 6 * 1024 + 557; + for (int i = 0; i < 131; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } cudf::table_view in(views); auto old_rows = cudf::old_convert_to_rows(in); @@ -269,9 +421,67 @@ TEST_F(RowToColumnTests, Big) std::vector views; std::vector schema; - for (int i = 0; i < 256; ++i) { + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 28; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Bigger) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 1024 * 1024; + for (int i = 0; i < 128; ++i) { + cols.push_back( + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + views.push_back(cols.back()); + 
schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + cudf::table_view in(views); + + auto old_rows = cudf::old_convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, Biggest) +{ + auto r = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); + std::vector> cols; + std::vector views; + std::vector schema; + + // 28 columns of 1 million rows + constexpr auto num_rows = 5 * 1024 * 1024; + for (int i = 0; i < 128; ++i) { cols.push_back( - cudf::test::fixed_width_column_wrapper(r + 4096 * i, r + 4096 * i + 4096)); + cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 1babbc6fd1a..9f0df3569a7 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -54,7 +54,9 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; -namespace cudf { +using cudf::detail::warp_size; + +namespace cudf::java { namespace detail { @@ -526,9 +528,9 @@ __global__ void copy_validity_from_columns( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -557,7 +559,7 @@ __global__ void copy_validity_from_columns( // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -855,12 +857,12 @@ __global__ void copy_validity_to_columns( auto const num_sections_x = (num_block_cols + 7) / 8; auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = align_offset(num_sections_y, 4); + auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. 
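  // With the section sizes above, each section spans 8 columns by 32 rows of validity:
  // every lane of a warp reads one row's validity byte (8 columns) and the warp's 32 lanes
  // together cover the 32 rows. A block launched with, say, 128 threads has 4 warps, so
  // warp w processes sections w, w + 4, w + 8, ... until total_sections is exhausted.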
for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -888,7 +890,7 @@ __global__ void copy_validity_to_columns( ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; From 83118d2c63101c31629e6cd3ade17bb772215e75 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 22 Sep 2021 03:11:58 +0000 Subject: [PATCH 51/80] Updates and bug fixes --- .../row_conversion/row_conversion.cpp | 2 +- cpp/src/row_conversion/row_conversion.cu | 206 +++++++----------- cpp/tests/row_conversion/row_conversion.cpp | 36 +-- java/src/main/native/src/row_conversion.cu | 106 ++++----- 4 files changed, 155 insertions(+), 195 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index ad9925e9043..2fe436a22c1 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include class RowConversion : public cudf::benchmark { diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0409a65b630..eb3c4b28b6a 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -568,11 +568,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - int8_t* output_check_addr = nullptr; - int8_t* output_block_start = nullptr; - size_type output_block_size = 0; - - bool print_debug = false; //threadIdx.x==0 && blockIdx.x == 0; + constexpr bool print_debug = false; // threadIdx.x==0 && blockIdx.x == 0; // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; if (print_debug) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); @@ -663,7 +659,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, warp size " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, " + "warp size " "%d\n", threadIdx.x, blockIdx.x, @@ -681,7 +678,12 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - if (print_debug) printf("working on section %d,%d - %d of %d...\n", section_x, section_y, my_section_idx, total_sections); + if (print_debug) + printf("working on section %d,%d - %d of %d...\n", + section_x, + section_y, + my_section_idx, + total_sections); auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; @@ -751,14 +753,18 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write byte if (print_debug) printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); this_shared_block[validity_write_offset] = validity_data & 0xFF; } else if 
(cols_left <= 16) { // write int16 if (print_debug) printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -766,7 +772,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write int16 and then int8 if (print_debug) printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data & 0xFFFF; @@ -775,7 +783,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // write int32 if (print_debug) printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, blockIdx.x, validity_write_offset, + threadIdx.x, + blockIdx.x, + validity_write_offset, &this_shared_block[validity_write_offset]); *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; @@ -823,63 +833,20 @@ __global__ void copy_validity_from_columns(const size_type num_rows, output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); -/* if (num_rows >= 5006) { - auto const row5006_col_65 = output_data[block.buffer_num] + row_offsets[5006] + validity_offset + 65 / 8; - if (output_ptr >= row5006_col_65 && output_ptr <= row5006_col_65 + 4) { - printf("%d %d - writing bytes from %p(0x%x)-%p to %p-%p that overlap global %p(0x%x), which is row 5006, col 65!\n", threadIdx.x, blockIdx.x, &this_shared_block[validity_data_row_length * relative_row], this_shared_block[validity_data_row_length * relative_row], &this_shared_block[validity_data_row_length * relative_row + num_bytes], output_ptr, output_ptr + num_bytes, row5006_col_65, *row5006_col_65); - printf("%d %d - block information\n%d,%d -> %d,%d\n%d columns, %d rows\n", threadIdx.x, blockIdx.x, block.start_col, block.start_row, block.end_col, block.end_row, block.num_cols(), block.num_rows()); - output_check_addr = row5006_col_65; - output_block_start = output_ptr; - output_block_size = num_bytes; - } - }*/ - cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - - /* auto const padding_ptr = output_ptr + num_bytes; - auto const padding_needed = -reinterpret_cast(padding_ptr) & 7; - if (print_debug) printf( - "absolute_row: %d, row_offset for this row: 0x%x, validity data bytes: %d, end - address: %p, padding bytes %lu\n", row, row_offsets[row], num_bytes, output_ptr + - num_bytes, padding_needed); cuda::memcpy_async(padding_ptr, zero, padding_needed, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - */ - - /* if (print_debug) { - for (int i=0; i get_admin_data_sizes(size_t col_size_size, @@ -932,10 +899,16 @@ static __device__ void fetch_blocks_for_row_to_column( ++fetch_index) { auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - if (debug_print) - printf("fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, ending offset %p\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks, fetch_block.start_col, fetch_block.end_col, &col_offsets[fetch_block.start_col], &col_offsets[fetch_block.end_col]); + if (debug_print) + printf( + "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending " + "offset %p\n", + blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, + total_blocks, + fetch_block.start_col, + fetch_block.end_col, + &col_offsets[fetch_block.start_col], + &col_offsets[fetch_block.end_col]); auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; @@ -977,7 +950,7 @@ static __device__ void fetch_blocks_for_row_to_column( &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes); - cuda::memcpy_async(group, + cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], &col_offsets[fetch_block.start_col], col_offset_bytes, @@ -985,23 +958,6 @@ static __device__ void fetch_blocks_for_row_to_column( shared_row_offset += col_offset_bytes; shared_row_offset = align_offset(shared_row_offset, 8); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0 && fetch_block.start_col == 0 && - fetch_block.start_row <= 51 && fetch_block.end_row >= 51) { - printf("Input data for col 0 row 51 is 0x"); - for (int i = 0; i < col_sizes[0]; ++i) { - printf("%x ", input_data[row_offsets[51] + col_offsets[0] + i]); - } - printf("\n"); - printf( - "this is at offset %d-%d and starting column offset is %d and we're reading %d bytes\n", - col_offsets[0], - col_offsets[0] + col_sizes[0], - starting_col_offset, - fetch_block_row_size); - auto shared_offset = (51 - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - printf("destination is %p", &shared[fetch_index % max_resident_blocks][shared_offset]); - } - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); row <= fetch_block_end_row; row += blockDim.x) { @@ -1012,7 +968,7 @@ static __device__ void fetch_blocks_for_row_to_column( fetch_index % max_resident_blocks, &shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset]); - // copy the main + // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, @@ -1058,7 +1014,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; //threadIdx.x == 0 && blockIdx.x == 0; + /*constexpr*/ bool debug_print = false; // threadIdx.x == 0 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1066,14 +1022,17 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); + printf( + "%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); /* printf("Row Offsets:\n"); for (int i=0; 
i NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED @@ -1696,7 +1650,7 @@ std::vector build_validity_block_infos( // note that an element in the table takes just 1 bit, but a row with a single // element still takes 8 bytes! auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1747,7 +1701,6 @@ std::vector build_block_infos(std::vector const& column_s } int const window_height = std::min(desired_window_height, rows_left_in_batch); -// printf("block %d, %d to %d, %d\n", start_col, current_window_start_row, end_col, std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1)); block_infos.emplace_back(detail::block_info{ start_col, current_window_start_row, @@ -1768,10 +1721,13 @@ std::vector build_block_infos(std::vector const& column_s // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = - std::clamp(util::round_up_safe(std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], total_number_of_rows), 32), - 1, - row_batches[0].row_count); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, + row_batches[0].row_count); #if defined(DEBUG) printf( "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " @@ -2385,8 +2341,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in return util::round_down_safe(desired_rows_and_columns, 32); }*/ }(); - printf("column stride is %d and row stride is %d. 
std::min(%d, util::round_down_safe(%d * 8 / %d, 32))\n", column_stride, row_stride, num_rows, shmem_limit_per_block, column_stride); - printf("each block uses %d bytes of shared memory\n", (column_stride / 8) * detail::align_offset(row_stride, 4)); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { for (int row = 0; row < num_rows; row += row_stride) { diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 26e071eef79..70a4552a6f9 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -168,8 +168,8 @@ TEST_F(ColumnToRowTests, Non2Power) constexpr auto num_rows = 6 * 1024 + 557; for (int i = 0; i < 131; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -184,9 +184,9 @@ TEST_F(ColumnToRowTests, Non2Power) auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - for (int j=0; jnum_columns(); ++j) { + for (int j = 0; j < old_tbl->num_columns(); ++j) { printf("testing column %d\n", j); - if (j==65) { + if (j == 65) { printf("old\n"); cudf::test::print(old_tbl->get_column(j)); printf("new\n"); @@ -214,8 +214,8 @@ TEST_F(ColumnToRowTests, Big) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 28; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -241,8 +241,8 @@ TEST_F(ColumnToRowTests, Bigger) // 128 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -268,8 +268,8 @@ TEST_F(ColumnToRowTests, Biggest) // 128 columns of 2 million rows constexpr auto num_rows = 2 * 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -396,8 +396,8 @@ TEST_F(RowToColumnTests, Non2Power) constexpr auto num_rows = 6 * 1024 + 557; for (int i = 0; i < 131; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -424,8 +424,8 @@ TEST_F(RowToColumnTests, Big) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 28; ++i) { - cols.push_back( - 
cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -452,8 +452,8 @@ TEST_F(RowToColumnTests, Bigger) // 28 columns of 1 million rows constexpr auto num_rows = 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } @@ -480,8 +480,8 @@ TEST_F(RowToColumnTests, Biggest) // 28 columns of 1 million rows constexpr auto num_rows = 5 * 1024 * 1024; for (int i = 0; i < 128; ++i) { - cols.push_back( - cudf::test::fixed_width_column_wrapper(r + num_rows * i, r + num_rows * i + num_rows)); + cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, + r + num_rows * i + num_rows)); views.push_back(cols.back()); schema.push_back(cudf::data_type{cudf::type_id::INT32}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 9f0df3569a7..c64a61b3373 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -54,9 +54,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; -using cudf::detail::warp_size; - -namespace cudf::java { +namespace cudf { namespace detail { @@ -403,7 +401,6 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -435,7 +432,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; - // copy the main + // copy the element to global memory cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -445,18 +442,19 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ subset_barrier.arrive_and_wait(); auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - /* auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols();*/ + auto const block_row_size = block.get_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; auto const output_dest = output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; auto const shared_offset = block_row_size * relative_row; + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); } @@ -528,23 +526,22 @@ __global__ 
void copy_validity_from_columns( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / warp_size; - int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx / num_sections_x; - auto const section_y = my_section_idx % num_sections_x; + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 8; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { @@ -552,14 +549,14 @@ __global__ void copy_validity_from_columns( input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shiffle the bits around with ballot_sync to make + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make // the bytes we actually write. 
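        // A minimal sketch of that transpose (hypothetical standalone helper, not part of
        // this file): each lane holds validity bits for one *column* (bit i = row i), and
        // one __ballot_sync per row yields a word whose bit k is the validity of column
        // (section start + k) for that row -- exactly the row-major layout written out below.
        //
        //   __device__ inline uint32_t column_bits_to_row_word(uint32_t lane_mask,
        //                                                      uint32_t my_column_bits,
        //                                                      int row_in_section) {
        //     return __ballot_sync(lane_mask, my_column_bits & (1u << row_in_section));
        //   }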
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % warp_size == 0) { + if (threadIdx.x % detail::warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -591,6 +588,7 @@ __global__ void copy_validity_from_columns( auto const output_ptr = output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + cuda::memcpy_async( output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); @@ -647,7 +645,6 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( @@ -718,9 +715,9 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co extern __shared__ int8_t shared_data[]; int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - __shared__ cuda::barrier block_barrier[stages_count]; + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < stages_count; ++i) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { init(&block_barrier[i], group.size()); } } @@ -748,12 +745,11 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co block_infos, _col_sizes, _col_offsets, row_offsets, input_data, shared, group, block_barrier); - auto &subset_barrier = block_barrier[subset % stages_count]; + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready subset_barrier.arrive_and_wait(); - auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); @@ -851,18 +847,15 @@ __global__ void copy_validity_to_columns( auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; auto const block_start_col = block.start_col; auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; auto const num_sections_y = (num_block_rows + 31) / 32; auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - - int const warp_id = threadIdx.x / warp_size; - int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / 
detail::warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -870,7 +863,6 @@ __global__ void copy_validity_to_columns( // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; auto const relative_row = section_y * 32 + lane_id; auto const absolute_col = relative_col + block_start_col; @@ -890,9 +882,11 @@ __global__ void copy_validity_to_columns( ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % warp_size == 0) { + if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; + auto const write_5006_offset = 837; // validity_data_col_length * (65 - block_start_col) + // + (5006 - block_start_row)/8; if (rows_left <= 8) { // write byte @@ -922,6 +916,8 @@ __global__ void copy_validity_to_columns( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; + auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( output_nm[col] + word_index(block_start_row), @@ -965,8 +961,9 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) + if (y_block_size > 32) { y_block_size = 32; + } int x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -1135,7 +1132,10 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro }(), 8); // we fit as much as we can given the column stride - auto const row_stride = std::min(num_rows, shmem_limit_per_block * 8 / column_stride); + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! + auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; for (int col = 0; col < num_columns; col += column_stride) { @@ -1203,13 +1203,12 @@ std::vector build_block_infos(std::vector const &column_s // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in // bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = - std::clamp(util::round_up_safe( - optimal_square_len <= (size_type)column_sizes.size() ? 
- std::min(optimal_square_len / column_sizes[0], total_number_of_rows) : - row_batches[0].row_count / 2, - 32), - 1, row_batches[0].row_count); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, row_batches[0].row_count); auto calc_admin_data_size = [](int num_cols) -> size_type { // admin data is the column sizes and column start information. @@ -1233,8 +1232,9 @@ std::vector build_block_infos(std::vector const &column_s if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col - 1, window_height); + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match @@ -1274,9 +1274,8 @@ std::vector> convert_to_rows(cudf::table_view cons int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: kernels fail to launch if we use all the available shared memory. + // TODO: why? total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. @@ -1381,6 +1380,16 @@ std::vector> convert_to_rows(cudf::table_view cons } c = c.child(1); } + exclusive_scan([t](int row_index) { + size_type total_row_size = 0; + for (int i=0 i> convert_to_rows(cudf::table_view cons // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust for (int row = 0; row < num_rows; ++row) { auto aligned_row_batch_size = detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned @@ -1578,7 +1588,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: unable to launch a kernel with all shared used + // TODO why? 
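   // (Hedged editor's note, not from the original author: on some architectures the
   // runtime reserves about 1 KB of shared memory per block for system use, and any
   // dynamic allocation above the 48 KB default needs an explicit opt-in, e.g.
   //
   //   CUDA_TRY(cudaDeviceGetAttribute(&total_shmem,
   //                                   cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id));
   //   CUDA_TRY(cudaFuncSetAttribute(detail::copy_to_columns,
   //                                 cudaFuncAttributeMaxDynamicSharedMemorySize,
   //                                 total_shmem));
   //
   // either of which could explain why launching with the full reported limit fails.)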
total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; @@ -1628,11 +1638,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); -#else - dim3 threads(std::min(256, (int)child.size())); -#endif + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); detail::copy_to_columns<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), @@ -1641,8 +1647,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); auto const column_stride = [&]() { if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); } else { return util::round_down_safe(desired_rows_and_columns, 8); } From d563eaa8443f4e4e8834ac80b2010360a3040425 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 1 Oct 2021 15:14:54 +0000 Subject: [PATCH 52/80] Fixing merge issue --- cpp/benchmarks/CMakeLists.txt | 54 ++++++++++++++++------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index a8f075d2464..79783f0e512 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -22,21 +22,10 @@ target_compile_options( "$<$:${CUDF_CUDA_FLAGS}>" ) -<<<<<<< HEAD target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main benchmark::benchmark nvbench::nvbench Threads::Threads cudf ) -======= -target_link_libraries(cudf_datagen - PUBLIC GTest::gmock - GTest::gtest - GTest::gmock_main - GTest::gtest_main - benchmark::benchmark - Threads::Threads - cudf) ->>>>>>> working on row and column conversions target_include_directories( cudf_datagen @@ -57,7 +46,6 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) # This function takes in a benchmark name and benchmark source and handles setting all of the # associated properties and linking to build the benchmark function(ConfigureBench CMAKE_BENCH_NAME) -<<<<<<< HEAD add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties( ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY @@ -83,17 +71,6 @@ endfunction() # ################################################################################################## # * column benchmarks ----------------------------------------------------------------------------- -======= - add_executable(${CMAKE_BENCH_NAME} ${ARGN}) - set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_BENCH_NAME} - PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) -endfunction() - -################################################################################################### -# - column benchmarks ----------------------------------------------------------------------------- ->>>>>>> working on row and column conversions ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) # 
################################################################################################## @@ -104,17 +81,12 @@ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) # * scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) -<<<<<<< HEAD # ################################################################################################## # * lists scatter benchmark ----------------------------------------------------------------------- ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) # ################################################################################################## # * contiguous_split benchmark ------------------------------------------------------------------- -======= -################################################################################################### -# - contiguous_split benchmark ------------------------------------------------------------------- ->>>>>>> working on row and column conversions ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) # ################################################################################################## @@ -146,8 +118,13 @@ ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) ======= ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- +<<<<<<< HEAD ConfigureBench(JOIN_BENCH join/join_benchmark.cu) >>>>>>> working on row and column conversions +======= +ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) +ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) +>>>>>>> Fixing merge issue # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- @@ -238,6 +215,7 @@ ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer_benchmark.cpp) # * ast benchmark --------------------------------------------------------------------------------- ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) +<<<<<<< HEAD # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench( @@ -249,6 +227,18 @@ ConfigureBench( # * nvtext benchmark ------------------------------------------------------------------- ConfigureBench( TEXT_BENCH +======= +################################################################################################### +# - binaryop benchmark ---------------------------------------------------------------------------- +ConfigureBench(BINARYOP_BENCH + binaryop/binaryop_benchmark.cpp + binaryop/compiled_binaryop_benchmark.cpp + binaryop/jit_binaryop_benchmark.cpp) + +################################################################################################### +# - nvtext benchmark ------------------------------------------------------------------- +ConfigureBench(TEXT_BENCH +>>>>>>> Fixing merge issue text/ngrams_benchmark.cpp text/normalize_benchmark.cpp text/normalize_spaces_benchmark.cpp @@ -273,6 +263,7 @@ ConfigureBench( string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp + string/repeat_strings_benchmark.cpp string/replace_benchmark.cpp 
string/replace_re_benchmark.cpp string/split_benchmark.cpp @@ -291,6 +282,11 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) ======= ################################################################################################### -# - row conversion benchmark ---------------------------------------------------------------------------- +# - io benchmark --------------------------------------------------------------------- +ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK + io/text/multibyte_split_benchmark.cpp) + +################################################################################################### +# - row conversion benchmark --------------------------------------------------------- ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) >>>>>>> working on row and column conversions From 5b6688db4a790947f9c7ffb5c9e7cb5f73c4124d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 1 Oct 2021 15:17:11 +0000 Subject: [PATCH 53/80] working on code to move block creation and batch creation to gpu --- cpp/src/row_conversion/row_conversion.cu | 180 +++++++++++++++++++- cpp/tests/row_conversion/row_conversion.cpp | 7 - 2 files changed, 178 insertions(+), 9 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index eb3c4b28b6a..ae218e637d0 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -20,6 +20,8 @@ #include #include #include +#include "cudf/detail/iterator.cuh" +#include "cudf/lists/lists_column_device_view.cuh" #include @@ -43,7 +45,9 @@ #include #include #include +#include +#include #include #include @@ -56,6 +60,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; +using rmm::device_uvector; namespace cudf { namespace detail { @@ -1352,8 +1357,6 @@ __global__ void copy_validity_to_columns(const size_type num_rows, if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - auto const write_5006_offset = 837; // validity_data_col_length * (65 - - // block_start_col) + (5006 - block_start_row)/8; if (print_debug) printf( @@ -1674,6 +1677,173 @@ std::vector build_validity_block_infos( return validity_block_infos; } +constexpr size_t max_batch_size = 1024; // 2ul * 1024 * 1024 * 1024; + +template +void build_batches(size_t total_size, + size_type num_rows, + CumulativeRowSize cumulative_row_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_batches = ((total_size + (max_batch_size - 1)) / max_batch_size); + auto const num_offsets = num_batches + 1; + printf("%lu batches so %lu offsets\n", num_batches, num_offsets); + + // at most max gpu memory / 2GB iterations. 
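  // The loop below is easier to follow as a host-side sketch (illustrative only; cum[i]
  // stands for cumulative_row_size[i], the cumulative byte size of the rows up to i):
  //
  //   size_type last = 0;
  //   offsets.push_back(0);
  //   while (another batch is needed) {
  //     // row sizes measured from the end of the previous batch
  //     auto batch_bytes = [&](size_type i) { return cum[i] - cum[last]; };
  //     size_type end = last;
  //     while (end + 1 < num_rows && batch_bytes(end + 1) < max_batch_size) { ++end; }
  //     offsets.push_back(end);   // last row that still fits in this batch
  //     last = end;
  //   }
  //
  // thrust::lower_bound performs the same search on the device in O(log n) per batch.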
+ std::vector h_batch_row_offsets; + h_batch_row_offsets.reserve(num_offsets); + h_batch_row_offsets.push_back(0); + size_type last_row_end = 0; + while (h_batch_row_offsets.size() < num_batches) { + // subtract out the size of the last row in the previous batch + auto adjusted_row_size = + thrust::make_transform_iterator(cumulative_row_size + last_row_end, + [last_row_end, cumulative_row_size] __device__(size_t size) { + return size - cumulative_row_size[last_row_end]; + }); + // find the next max_batch_size boundary + size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), + adjusted_row_size, + adjusted_row_size + (num_rows - last_row_end), + max_batch_size) - + adjusted_row_size) + + last_row_end) - + 1; + + h_batch_row_offsets.push_back(row_end); + last_row_end = row_end; + } + printf("batches: "); + for (uint i = 0; i < h_batch_row_offsets.size(); ++i) { + printf("%d ", h_batch_row_offsets[i]); + } + printf("\n"); +} + +int compute_block_counts(device_uvector const& batch_row_offsets, + int desired_window_height, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / + desired_window_height; + }); + return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); +} + +size_type block_lambda( + block_info* blocks, + device_uvector const& batch_row_offsets, // comes from build_batches + int column_start, + int column_end, + int desired_window_height, + int total_number_of_rows, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_blocks.begin(), + [=, batch_row_offsets = batch_row_offsets.data()] __device__(int batch_index) -> size_type { + return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / + desired_window_height; + }); + size_type const total_blocks = + thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + device_uvector block_starts(num_batches, stream); + thrust::exclusive_scan(rmm::exec_policy(stream), + num_blocks.begin(), + num_blocks.end(), + block_starts.begin()); // in blocks + + thrust::for_each( + rmm::exec_policy(stream), + iter, + iter + total_blocks, + [ =, + block_starts = block_starts.data(), + batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + block_info& bi = blocks[block_index]; + + // what batch this block falls in + auto const batch_index_iter = + thrust::lower_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); + auto const batch_index = batch_index_iter == block_starts ? 0 : *batch_index_iter; + // local index within the block + int const local_block_index = block_index - block_starts[batch_index]; + // the start row for this batch. 
+ int const batch_row_start = batch_row_offsets[batch_index]; + // the start row for this block + int const block_row_start = batch_row_start + (local_block_index * desired_window_height); + // the end row for this block + int const max_row = std::min(total_number_of_rows, + batch_index + 1 > num_batches + ? std::numeric_limits::max() + : static_cast(batch_row_offsets[batch_index + 1])); + int const block_row_end = + std::min(batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, + total_number_of_rows); + + // stuff the block + bi.start_col = column_start; + bi.end_col = column_end; + bi.start_row = block_row_start; + bi.end_row = block_row_end; + bi.buffer_num = batch_index; + }); + + return total_blocks; +} + +void test_block_lambda(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ + device_uvector batch_row_offsets(3, stream); + batch_row_offsets.set_element(0, 0, stream); + batch_row_offsets.set_element(1, 2000, stream); + batch_row_offsets.set_element(2, 5000, stream); + + // three groups of columns that can hold 128, 1024, and 768 rows each. + auto const total_blocks = compute_block_counts(batch_row_offsets, 128, stream, mr) + + compute_block_counts(batch_row_offsets, 1024, stream, mr) + + compute_block_counts(batch_row_offsets, 768, stream, mr); + + auto const table_num_rows = 50 * 1024; + + // allocate memory for all blocks + device_uvector blocks(total_blocks, stream); + + auto used_blocks = + block_lambda(blocks.data(), batch_row_offsets, 0, 15, 128, table_num_rows, stream, mr); + used_blocks += block_lambda( + blocks.data() + used_blocks, batch_row_offsets, 16, 28, 1024, table_num_rows, stream, mr); + used_blocks += block_lambda( + blocks.data() + used_blocks, batch_row_offsets, 29, 32, 768, table_num_rows, stream, mr); + + CUDF_EXPECTS(used_blocks == total_blocks, "used not equal to total!"); + + for (int i = 0; i < total_blocks; ++i) { + auto const block = blocks.element(i, stream); + printf( + "%d: %d,%d -> %d,%d\n", i, block.start_col, block.start_row, block.end_col, block.end_row); + } +} + std::vector build_block_infos(std::vector const& column_sizes, std::vector const& column_starts, std::vector const& row_batches, @@ -2245,6 +2415,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in cudf::size_type num_columns = schema.size(); cudf::size_type num_rows = input.parent().size(); + auto cumulative_row_size = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_t row_index) { return 300 * row_index; }); + detail::build_batches(1024 * 1024, 1024, cumulative_row_size, stream, mr); + + detail::test_block_lambda(stream, mr); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem; diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 70a4552a6f9..48d9690d583 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -185,13 +185,6 @@ TEST_F(ColumnToRowTests, Non2Power) auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { - printf("testing column %d\n", j); - if (j == 65) { - printf("old\n"); - cudf::test::print(old_tbl->get_column(j)); - printf("new\n"); - cudf::test::print(new_tbl->get_column(j)); - } CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); } From 53912ca1b9a786d9ae4c3cb7241d2ff87bd1781c Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 6 Oct 2021 
19:41:49 +0000 Subject: [PATCH 54/80] pulling incomplete code for gpu building block data --- cpp/src/row_conversion/row_conversion.cu | 173 --------------------- java/src/main/native/src/row_conversion.cu | 53 +------ 2 files changed, 6 insertions(+), 220 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index ae218e637d0..9674000a69d 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -1677,173 +1677,6 @@ std::vector build_validity_block_infos( return validity_block_infos; } -constexpr size_t max_batch_size = 1024; // 2ul * 1024 * 1024 * 1024; - -template -void build_batches(size_t total_size, - size_type num_rows, - CumulativeRowSize cumulative_row_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_batches = ((total_size + (max_batch_size - 1)) / max_batch_size); - auto const num_offsets = num_batches + 1; - printf("%lu batches so %lu offsets\n", num_batches, num_offsets); - - // at most max gpu memory / 2GB iterations. - std::vector h_batch_row_offsets; - h_batch_row_offsets.reserve(num_offsets); - h_batch_row_offsets.push_back(0); - size_type last_row_end = 0; - while (h_batch_row_offsets.size() < num_batches) { - // subtract out the size of the last row in the previous batch - auto adjusted_row_size = - thrust::make_transform_iterator(cumulative_row_size + last_row_end, - [last_row_end, cumulative_row_size] __device__(size_t size) { - return size - cumulative_row_size[last_row_end]; - }); - // find the next max_batch_size boundary - size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), - adjusted_row_size, - adjusted_row_size + (num_rows - last_row_end), - max_batch_size) - - adjusted_row_size) + - last_row_end) - - 1; - - h_batch_row_offsets.push_back(row_end); - last_row_end = row_end; - } - printf("batches: "); - for (uint i = 0; i < h_batch_row_offsets.size(); ++i) { - printf("%d ", h_batch_row_offsets[i]); - } - printf("\n"); -} - -int compute_block_counts(device_uvector const& batch_row_offsets, - int desired_window_height, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type const num_batches = batch_row_offsets.size() - 1; - device_uvector num_blocks(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / - desired_window_height; - }); - return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); -} - -size_type block_lambda( - block_info* blocks, - device_uvector const& batch_row_offsets, // comes from build_batches - int column_start, - int column_end, - int desired_window_height, - int total_number_of_rows, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type const num_batches = batch_row_offsets.size() - 1; - device_uvector num_blocks(num_batches, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - iter, - iter + num_batches, - num_blocks.begin(), - [=, batch_row_offsets = batch_row_offsets.data()] __device__(int batch_index) -> size_type { - return (batch_row_offsets[batch_index + 1] - batch_row_offsets[batch_index]) / - 
desired_window_height; - }); - size_type const total_blocks = - thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); - device_uvector block_starts(num_batches, stream); - thrust::exclusive_scan(rmm::exec_policy(stream), - num_blocks.begin(), - num_blocks.end(), - block_starts.begin()); // in blocks - - thrust::for_each( - rmm::exec_policy(stream), - iter, - iter + total_blocks, - [ =, - block_starts = block_starts.data(), - batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { - block_info& bi = blocks[block_index]; - - // what batch this block falls in - auto const batch_index_iter = - thrust::lower_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); - auto const batch_index = batch_index_iter == block_starts ? 0 : *batch_index_iter; - // local index within the block - int const local_block_index = block_index - block_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_offsets[batch_index]; - // the start row for this block - int const block_row_start = batch_row_start + (local_block_index * desired_window_height); - // the end row for this block - int const max_row = std::min(total_number_of_rows, - batch_index + 1 > num_batches - ? std::numeric_limits::max() - : static_cast(batch_row_offsets[batch_index + 1])); - int const block_row_end = - std::min(batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, - total_number_of_rows); - - // stuff the block - bi.start_col = column_start; - bi.end_col = column_end; - bi.start_row = block_row_start; - bi.end_row = block_row_end; - bi.buffer_num = batch_index; - }); - - return total_blocks; -} - -void test_block_lambda(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - device_uvector batch_row_offsets(3, stream); - batch_row_offsets.set_element(0, 0, stream); - batch_row_offsets.set_element(1, 2000, stream); - batch_row_offsets.set_element(2, 5000, stream); - - // three groups of columns that can hold 128, 1024, and 768 rows each. 
- auto const total_blocks = compute_block_counts(batch_row_offsets, 128, stream, mr) + - compute_block_counts(batch_row_offsets, 1024, stream, mr) + - compute_block_counts(batch_row_offsets, 768, stream, mr); - - auto const table_num_rows = 50 * 1024; - - // allocate memory for all blocks - device_uvector blocks(total_blocks, stream); - - auto used_blocks = - block_lambda(blocks.data(), batch_row_offsets, 0, 15, 128, table_num_rows, stream, mr); - used_blocks += block_lambda( - blocks.data() + used_blocks, batch_row_offsets, 16, 28, 1024, table_num_rows, stream, mr); - used_blocks += block_lambda( - blocks.data() + used_blocks, batch_row_offsets, 29, 32, 768, table_num_rows, stream, mr); - - CUDF_EXPECTS(used_blocks == total_blocks, "used not equal to total!"); - - for (int i = 0; i < total_blocks; ++i) { - auto const block = blocks.element(i, stream); - printf( - "%d: %d,%d -> %d,%d\n", i, block.start_col, block.start_row, block.end_col, block.end_row); - } -} - std::vector build_block_infos(std::vector const& column_sizes, std::vector const& column_starts, std::vector const& row_batches, @@ -2415,12 +2248,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in cudf::size_type num_columns = schema.size(); cudf::size_type num_rows = input.parent().size(); - auto cumulative_row_size = cudf::detail::make_counting_transform_iterator( - 0, [] __device__(size_t row_index) { return 300 * row_index; }); - detail::build_batches(1024 * 1024, 1024, cumulative_row_size, stream, mr); - - detail::test_block_lambda(stream, mr); - int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem; diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c64a61b3373..481787c6004 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,6 +21,8 @@ #include #include +#include +#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -42,6 +44,8 @@ #include #include #include +#include +#include #include #include @@ -54,6 +58,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; #endif using cudf::detail::make_device_uvector_async; +using rmm::device_uvector; namespace cudf { namespace detail { @@ -885,8 +890,6 @@ __global__ void copy_validity_to_columns( if (threadIdx.x % detail::warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - auto const write_5006_offset = 837; // validity_data_col_length * (65 - block_start_col) - // + (5006 - block_start_row)/8; if (rows_left <= 8) { // write byte @@ -1330,28 +1333,7 @@ std::vector> convert_to_rows(cudf::table_view cons }); size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, - column_sizes); //, - // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 
8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); @@ -1368,29 +1350,6 @@ std::vector> convert_to_rows(cudf::table_view cons // will be included in the variable-width data blob at the end of the // row. return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - exclusive_scan([t](int row_index) { - size_type total_row_size = 0; - for (int i=0 i Date: Thu, 7 Oct 2021 04:01:36 +0000 Subject: [PATCH 55/80] Fixing issue Raza found with 8-byte data --- cpp/src/row_conversion/row_conversion.cu | 27 +++-- cpp/tests/row_conversion/row_conversion.cpp | 122 ++++++++++++++++---- java/src/main/native/src/row_conversion.cu | 23 ++-- 3 files changed, 132 insertions(+), 40 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 9674000a69d..84fab20fce5 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -333,9 +333,9 @@ struct block_info { int end_row; int buffer_num; - __host__ __device__ size_type get_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool debug_print = false) const + __host__ __device__ size_type get_shared_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const { if (debug_print) printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", @@ -350,6 +350,14 @@ struct block_info { col_offsets[start_col]); return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } + __host__ __device__ size_type get_dest_row_size(size_type const* const col_offsets, + size_type const* const col_sizes, + bool debug_print = false) const + { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + + util::div_rounding_up_unsafe(num_cols(), 8), + 8); + } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -456,7 +464,7 @@ __global__ void copy_from_columns(const size_type num_rows, auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[fetch_block.start_col]; auto& fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -513,7 +521,8 @@ __global__ void copy_from_columns(const size_type num_rows, 
/* auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols();*/ - auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -521,7 +530,7 @@ __global__ void copy_from_columns(const size_type num_rows, absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -918,8 +927,8 @@ static __device__ void fetch_blocks_for_row_to_column( auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -1115,7 +1124,7 @@ __global__ void copy_to_columns(const size_type num_rows, auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - auto block_row_size = block.get_row_size(_col_offsets, _col_sizes, debug_print); + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes, debug_print); // now we copy from shared memory to final destination. 
// the data is laid out in rows in shared memory, so the reads diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 48d9690d583..0ab8b70a0f7 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -46,9 +46,9 @@ TEST_F(ColumnToRowTests, Single) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Simple) @@ -68,9 +68,9 @@ TEST_F(ColumnToRowTests, Simple) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Tall) @@ -93,9 +93,9 @@ TEST_F(ColumnToRowTests, Tall) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Wide) @@ -122,9 +122,9 @@ TEST_F(ColumnToRowTests, Wide) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, SingleByteWide) @@ -153,9 +153,9 @@ TEST_F(ColumnToRowTests, SingleByteWide) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Non2Power) @@ -191,9 +191,9 @@ TEST_F(ColumnToRowTests, Non2Power) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Big) @@ -218,9 +218,21 @@ TEST_F(ColumnToRowTests, Big) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Bigger) @@ -245,9 +257,20 @@ TEST_F(ColumnToRowTests, Bigger) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); 
- for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(ColumnToRowTests, Biggest) @@ -272,9 +295,20 @@ TEST_F(ColumnToRowTests, Biggest) auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + + for (int j = 0; j < old_tbl->num_columns(); ++j) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); + } + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } + /* for (uint i = 0; i < old_rows.size(); i++) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); + }*/ } TEST_F(RowToColumnTests, Single) @@ -379,6 +413,46 @@ TEST_F(RowToColumnTests, SingleByteWide) } } +TEST_F(RowToColumnTests, Raza) +{ + std::vector> cols; + std::vector views; + std::vector schema{cudf::data_type{cudf::type_id::INT64}, + cudf::data_type{cudf::type_id::FLOAT64}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::BOOL8}, + cudf::data_type{cudf::type_id::FLOAT32}, + cudf::data_type{cudf::type_id::INT8}, + cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::INT64}}; + + cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, + {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); + cudf::test::fixed_point_column_wrapper c6( + {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper c7( + {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); + + cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); + + auto old_rows = cudf::old_convert_to_rows(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + TEST_F(RowToColumnTests, Non2Power) { auto r = diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 481787c6004..1808c7534df 100644 --- a/java/src/main/native/src/row_conversion.cu 
+++ b/java/src/main/native/src/row_conversion.cu @@ -330,10 +330,18 @@ struct block_info { int end_row; int buffer_num; - __host__ __device__ size_type get_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { + __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes, + bool debug_print = false) const { return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } + __host__ __device__ size_type get_dest_row_size(size_type const *const col_offsets, + size_type const *const col_sizes, + bool debug_print = false) const { + return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + + util::div_rounding_up_unsafe(num_cols(), 8), + 8); + } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -409,7 +417,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[fetch_block.start_col]; auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; @@ -448,7 +456,8 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - auto const block_row_size = block.get_row_size(col_offsets, col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -457,7 +466,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * block_row_size + column_offset; + output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; auto const shared_offset = block_row_size * relative_row; cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, @@ -650,7 +659,7 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde auto const fetch_block_start_row = fetch_block.start_row; auto const fetch_block_end_row = fetch_block.end_row; auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_row_size(col_offsets, col_sizes); + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto const num_fetch_cols = fetch_block.num_cols(); auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); @@ -766,7 +775,7 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - auto block_row_size = block.get_row_size(_col_offsets, _col_sizes); + auto block_row_size = block.get_shared_row_size(_col_offsets, 
_col_sizes); // now we copy from shared memory to final destination. // the data is laid out in rows in shared memory, so the reads From fb6dd51fb91d2694735ea9fdd5f86504ef78ebdb Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 6 Oct 2021 14:43:18 -0700 Subject: [PATCH 56/80] Use the new row<->col method Added a new method `convertFromRowsFixedWidthOptimized` and `convertToRowsFixedWidthOptimized` to be used for when columns are < 100. Otherwise use the new method This is currently failing simple tests --- java/src/main/java/ai/rapids/cudf/Table.java | 33 +++++++++++ java/src/main/native/src/TableJni.cpp | 56 ++++++++++++++++++- .../test/java/ai/rapids/cudf/TableTest.java | 43 +++++++++++++- 3 files changed, 128 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 68e7a21988a..eb61ec25d9a 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -645,8 +645,12 @@ private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long left private static native long[] convertToRows(long nativeHandle); + private static native long[] convertToRowsFixedWidthOptimized(long nativeHandle); + private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale); + private static native long[] convertFromRowsFixedWidthOptimized(long nativeColumnView, int[] types, int[] scale); + private static native long[] repeatStaticCount(long tableHandle, int count); private static native long[] repeatColumnCount(long tableHandle, @@ -2730,6 +2734,15 @@ public ColumnVector[] convertToRows() { return ret; } + public ColumnVector[] convertToRowsFixedWidthOptimized() { + long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); + ColumnVector[] ret = new ColumnVector[ptrs.length]; + for (int i = 0; i < ptrs.length; i++) { + ret[i] = new ColumnVector(ptrs[i]); + } + return ret; + } + /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. @@ -2750,6 +2763,26 @@ public static Table convertFromRows(ColumnView vec, DType ... schema) { return new Table(convertFromRows(vec.getNativeView(), types, scale)); } + /** + * Convert a column of list of bytes that is formatted like the output from `convertToRows` + * and convert it back to a table. + * @param vec the row data to process. + * @param schema the types of each column. + * @return the parsed table. + */ + public static Table convertFromRowsFixedWidthOptimized(ColumnView vec, DType ... schema) { + // TODO at some point we need a schema that support nesting so we can support nested types + // TODO we will need scale at some point very soon too + int[] types = new int[schema.length]; + int[] scale = new int[schema.length]; + for (int i = 0; i < schema.length; i++) { + types[i] = schema[i].typeId.nativeId; + scale[i] = schema[i].getScale(); + + } + return new Table(convertFromRowsFixedWidthOptimized(vec.getNativeView(), types, scale)); + } + /** * Construct a table from a packed representation. 
* @param metadata host-based metadata for the table diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c66cf13a5ae..97fe7b4c71e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -604,16 +605,20 @@ class native_arrow_ipc_reader_handle final { static jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, std::vector> &extra_columns) { + std::cout << "entering convert_table_for_return\n"; std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); + std::cout << "0\n"; for (int i = 0; i < table_cols; i++) { outcol_handles[i] = reinterpret_cast(ret[i].release()); } + std::cout << "1\n"; for (size_t i = 0; i < extra_columns.size(); i++) { outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); } + std::cout << "exiting convert_table_for_return\n"; return outcol_handles.get_jArray(); } @@ -2688,14 +2693,35 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, + jlong input_table) { + JNI_NULL_CHECK(env, input_table, "input table is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::table_view *n_input_table = reinterpret_cast(input_table); + std::vector> cols = cudf::old_convert_to_rows(*n_input_table); + int num_columns = cols.size(); + cudf::jni::native_jlongArray outcol_handles(env, num_columns); + for (int i = 0; i < num_columns; i++) { + outcol_handles[i] = reinterpret_cast(cols[i].release()); + } + return outcol_handles.get_jArray(); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { + std::cout << "convert_to_rows\n"; cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::java::convert_to_rows(*n_input_table); + std::cout << "before convert_to_rows\n"; + std::vector> cols = cudf::convert_to_rows(*n_input_table); + std::cout << "after convert_to_rows\n"; int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2706,6 +2732,29 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(JNIEnv *env, jclass, + jlong input_column, + jintArray types, + jintArray scale) { + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + JNI_NULL_CHECK(env, types, "types is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *input = reinterpret_cast(input_column); + cudf::lists_column_view list_input(*input); + cudf::jni::native_jintArray n_types(env, types); + cudf::jni::native_jintArray n_scale(env, scale); + std::vector types_vec; + for (int i = 0; i < n_types.size(); i++) { + types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); + } + std::unique_ptr result = cudf::old_convert_from_rows(list_input, types_vec); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} 
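A minimal host-side sketch of the round trip that the convertToRows/convertFromRows JNI entry points above wrap, assuming only the signatures declared in cudf/row_conversion.hpp earlier in this series; the helper name round_trip_first_batch, the explicit includes, and the omission of stream/memory-resource arguments are illustrative, not part of the patch.

#include <memory>
#include <vector>

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/row_conversion.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Convert a table to packed rows and back, mirroring what the JNI pair does
// for a single returned batch of rows.
std::unique_ptr<cudf::table> round_trip_first_batch(cudf::table_view const& input)
{
  // schema for the return trip: one data_type per input column
  std::vector<cudf::data_type> schema;
  for (auto col = input.begin(); col < input.end(); ++col) {
    schema.push_back(col->type());
  }

  // each returned column is a list-of-bytes column holding one batch of rows
  auto row_batches = cudf::convert_to_rows(input);

  // convert the first batch back into columnar form
  return cudf::convert_from_rows(cudf::lists_column_view(*row_batches.front()), schema);
}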
+ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass, jlong input_column, jintArray types, @@ -2714,6 +2763,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e JNI_NULL_CHECK(env, types, "types is null", 0); try { + std::cout << "convert_from_rows\n"; cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_column); cudf::lists_column_view list_input(*input); @@ -2723,7 +2773,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); + std::cout << "before convert_from_rows\n"; + std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); + std::cout << "after convert_from_rows\n"; return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 280a4d33ae9..623b444676f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -51,6 +51,7 @@ import java.nio.file.Files; import java.util.*; import java.util.stream.Collectors; +import java.util.stream.IntStream; import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; @@ -7210,6 +7211,44 @@ void testStructColumnFilterStrings() { } } + @Test + void fixedWidthRowsRoundTripWide() { + TestBuilder tb = new TestBuilder(); + IntStream.range(0, 10).forEach(i -> tb.column(3l, 9l, 4l, 2l, 20l, null)); + IntStream.range(0, 10).forEach(i -> tb.column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null)); + IntStream.range(0, 10).forEach(i -> tb.column(5, 1, 0, 2, 7, null)); + IntStream.range(0, 10).forEach(i -> tb.column(true, false, false, true, false, null)); + IntStream.range(0, 10).forEach(i -> tb.column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f, null)); + IntStream.range(0, 10).forEach(i -> tb.column(new Byte[]{2, 3, 4, 5, 9, null})); + IntStream.range(0, 10).forEach(i -> tb.decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, + 9.5d, 0.9d, 7.23d, 2.8d, null)); + IntStream.range(0, 10).forEach(i -> tb.decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null)); + try (Table t = tb.build()) { + ColumnVector[] rows = t.convertToRows(); + try { + // We didn't overflow + assert rows.length == 1; + ColumnVector cv = rows[0]; + assert cv.getRowCount() == t.getRowCount(); +// try (HostColumnVector hcv = cv.copyToHost()) { +// hcv.getChildColumnView(0).getDataBuffer().printBuffer(8); +// } + + DType[] types = new DType[t.getNumberOfColumns()]; + for (int i = 0; i < t.getNumberOfColumns(); i++) { + types[i] = t.getColumn(i).getType(); + } + try (Table backAgain = Table.convertFromRows(cv, types)) { + assertTablesAreEqual(t, backAgain); + } + } finally { + for (ColumnVector cv : rows) { + cv.close(); + } + } + } + } + @Test void fixedWidthRowsRoundTrip() { try (Table t = new TestBuilder() @@ -7222,7 +7261,7 @@ void fixedWidthRowsRoundTrip() { .decimal32Column(-3, RoundingMode.UNNECESSARY, 5.0d, 9.5d, 0.9d, 7.23d, 2.8d, null) .decimal64Column(-8, 3L, 9L, 4L, 2L, 20L, null) .build()) { - ColumnVector[] rows = t.convertToRows(); + ColumnVector[] rows = t.convertToRowsFixedWidthOptimized(); try { // We didn't overflow assert rows.length == 1; @@ -7236,7 +7275,7 @@ void fixedWidthRowsRoundTrip() { for (int i = 0; i 
< t.getNumberOfColumns(); i++) { types[i] = t.getColumn(i).getType(); } - try (Table backAgain = Table.convertFromRows(cv, types)) { + try (Table backAgain = Table.convertFromRowsFixedWidthOptimized(cv, types)) { assertTablesAreEqual(t, backAgain); } } finally { From b0173bfa0b0006c8def3dde0b659a996b6a6078b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 13 Oct 2021 22:06:04 +0000 Subject: [PATCH 57/80] fixing bug with float columns when 'enough' data was present. Updated function names --- .../row_conversion/row_conversion.cpp | 8 +- cpp/include/cudf/row_conversion.hpp | 4 +- cpp/src/row_conversion/row_conversion.cu | 95 ++++--- cpp/tests/row_conversion/row_conversion.cpp | 245 ++++++++++++------ java/src/main/native/src/TableJni.cpp | 16 +- java/src/main/native/src/row_conversion.cu | 66 ++--- java/src/main/native/src/row_conversion.hpp | 19 +- 7 files changed, 265 insertions(+), 188 deletions(-) diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp index 2fe436a22c1..fb8e4c8aef3 100644 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ b/cpp/benchmarks/row_conversion/row_conversion.cpp @@ -50,7 +50,7 @@ static void BM_old_to_row(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -109,13 +109,13 @@ static void BM_old_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto out = cudf::old_convert_from_rows(first_list, schema); + auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); } state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); @@ -144,7 +144,7 @@ static void BM_new_from_row(benchmark::State& state) total_bytes += cudf::size_of(t); } - auto rows = cudf::old_convert_to_rows(table->view()); + auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); cudf::lists_column_view const first_list(rows.front()->view()); for (auto _ : state) { diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp index 8f82d01b06c..5d799f4c596 100644 --- a/cpp/include/cudf/row_conversion.hpp +++ b/cpp/include/cudf/row_conversion.hpp @@ -24,7 +24,7 @@ namespace cudf { -std::vector> old_convert_to_rows( +std::vector> convert_to_rows_fixed_width_optimized( cudf::table_view const& tbl, // TODO need something for validity rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -36,7 +36,7 @@ std::vector> convert_to_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr old_convert_from_rows( +std::unique_ptr convert_from_rows_fixed_width_optimized( cudf::lists_column_view const& input, std::vector const& schema, rmm::cuda_stream_view stream = rmm::cuda_stream_default, diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 84fab20fce5..0457bbf71e4 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ 
b/cpp/src/row_conversion/row_conversion.cu @@ -53,7 +53,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -350,14 +350,6 @@ struct block_info { col_offsets[start_col]); return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } - __host__ __device__ size_type get_dest_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool debug_print = false) const - { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + - util::div_rounding_up_unsafe(num_cols(), 8), - 8); - } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -441,9 +433,8 @@ __global__ void copy_from_columns(const size_type num_rows, // else { return; } auto const blocks_remaining = - std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); size_t fetch; size_t subset; @@ -451,11 +442,11 @@ __global__ void copy_from_columns(const size_type num_rows, // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { if (debug_print) - printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch); - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch); + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; if (debug_print) printf("block %lu rows %d-%d and cols %d-%d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch, + blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch, fetch_block.start_row, fetch_block.end_row, fetch_block.start_col, @@ -474,9 +465,9 @@ __global__ void copy_from_columns(const size_type num_rows, // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it more - // appealing to copy element-wise from input data into shared matching the end layout and do - // row-based memcopies out. + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; @@ -499,14 +490,15 @@ __global__ void copy_from_columns(const size_type num_rows, auto const input_src = input_data[absolute_col] + col_size * absolute_row; if (debug_print) - printf("block %lu to shared chunk %lu. 
%p <- %p - %d bytes\n", + printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", fetch, fetch % stages_count, &shared[fetch % stages_count][shared_offset], input_src, + *input_src, col_size); - // copy the element to global memory + // copy the element from global memory cuda::memcpy_async( &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -515,14 +507,11 @@ __global__ void copy_from_columns(const size_type num_rows, auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; if (debug_print) - printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset); + printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - /* auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols();*/ auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -530,7 +519,7 @@ __global__ void copy_from_columns(const size_type num_rows, absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; + output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -543,6 +532,7 @@ __global__ void copy_from_columns(const size_type num_rows, &shared[subset % stages_count][shared_offset], block_row_size, absolute_row); + cuda::memcpy_async( output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); } @@ -673,7 +663,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, blockDim.x=%d, " + "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, " + "blockDim.x=%d, " "warp size " "%d\n", threadIdx.x, @@ -709,7 +700,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, if (print_debug) printf( - "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d absolute)\n", + "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d " + "absolute)\n", participation_mask, relative_row, absolute_row, @@ -744,8 +736,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_col); // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make - // the bytes we actually write. + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. 
for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data @@ -915,7 +907,8 @@ static __device__ void fetch_blocks_for_row_to_column( block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; if (debug_print) printf( - "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, ending " + "fetching block %lu of %d for start col %d, end col %d. Starting col offset is %p, " + "ending " "offset %p\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, total_blocks, @@ -1242,7 +1235,8 @@ __global__ void copy_validity_to_columns(const size_type num_rows, block_infos, blockIdx.x); printf( - "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, row " + "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, " + "row " "offsets are %p, block infos at %p\n", threadIdx.x, blockIdx.x, @@ -1595,8 +1589,8 @@ static inline int32_t compute_fixed_width_layout(std::vector co } // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it - // in + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in int32_t validity_bytes_needed = (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. @@ -1727,11 +1721,11 @@ std::vector build_block_infos(std::vector const& column_s }; // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. + // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = std::clamp( util::round_up_safe( @@ -1787,9 +1781,11 @@ std::vector build_block_infos(std::vector const& column_s calc_admin_data_size(col - current_window_start_col), shmem_limit_per_block); printf( - "Window size %d too large at column %d, admin size is %d, bumping back to build windows of " + "Window size %d too large at column %d, admin size is %d, bumping back to build windows " + "of " "size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is %d) " + "%d-%d), which is %d tall. 
Row size is too large at %d and ok at %d(aligned overall is " + "%d) " "for shared mem size %d\n", row_size_with_end_pad * window_height, col, @@ -1809,7 +1805,8 @@ std::vector build_block_infos(std::vector const& column_s detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); #if defined(DEBUG) printf( - "New window starting with offset %d and row size %d to be %d (previous column offset %d+%d " + "New window starting with offset %d and row size %d to be %d (previous column offset " + "%d+%d " "or %d)\n", row_size, col_size, @@ -2172,9 +2169,8 @@ std::vector> convert_to_rows(cudf::table_view cons #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> old_convert_to_rows(cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { const cudf::size_type num_columns = tbl.num_columns(); @@ -2399,10 +2395,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const& in #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr old_convert_from_rows(cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // verify that the types are what we expect cudf::column_view child = input.child(); diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp index 0ab8b70a0f7..746ac0655f7 100644 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ b/cpp/tests/row_conversion/row_conversion.cpp @@ -14,15 +14,21 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include +#include #include #include #include #include #include -#include -#include "cudf/lists/lists_column_view.hpp" -#include "cudf/types.hpp" +#include + +#include struct ColumnToRowTests : public cudf::test::BaseFixture { }; @@ -35,20 +41,17 @@ TEST_F(ColumnToRowTests, Single) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Simple) @@ -57,20 +60,17 @@ TEST_F(ColumnToRowTests, Simple) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Tall) @@ -81,21 +81,18 @@ TEST_F(ColumnToRowTests, Tall) cudf::table_view in(std::vector{a}); std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Wide) @@ -111,20 +108,17 @@ TEST_F(ColumnToRowTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = 
cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, SingleByteWide) @@ -141,21 +135,18 @@ TEST_F(ColumnToRowTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Non2Power) @@ -175,13 +166,14 @@ TEST_F(ColumnToRowTests, Non2Power) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -190,10 +182,6 @@ TEST_F(ColumnToRowTests, Non2Power) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Big) @@ -214,13 +202,14 @@ TEST_F(ColumnToRowTests, Big) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -229,10 +218,6 @@ TEST_F(ColumnToRowTests, Big) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Bigger) @@ -253,12 +238,13 @@ TEST_F(ColumnToRowTests, Bigger) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -267,10 +253,6 @@ TEST_F(ColumnToRowTests, Bigger) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(ColumnToRowTests, Biggest) @@ -291,13 +273,14 @@ TEST_F(ColumnToRowTests, Biggest) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { @@ -306,9 +289,6 @@ TEST_F(ColumnToRowTests, Biggest) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } - /* for (uint i = 0; i < old_rows.size(); i++) { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*old_rows[i], *new_rows[i]); - }*/ } TEST_F(RowToColumnTests, Single) @@ -319,7 +299,8 @@ TEST_F(RowToColumnTests, Single) auto old_rows = cudf::convert_to_rows(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -331,10 +312,11 @@ TEST_F(RowToColumnTests, Simple) cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema{cudf::data_type{cudf::type_id::INT32}}; for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -348,14 +330,15 @@ TEST_F(RowToColumnTests, Tall) cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); cudf::table_view in(std::vector{a}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -373,7 +356,7 @@ 
TEST_F(RowToColumnTests, Wide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { @@ -381,7 +364,8 @@ TEST_F(RowToColumnTests, Wide) } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -399,21 +383,22 @@ TEST_F(RowToColumnTests, SingleByteWide) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); std::vector schema; schema.reserve(in.num_columns()); for (auto col = in.begin(); col < in.end(); ++col) { schema.push_back(col->type()); } for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } } -TEST_F(RowToColumnTests, Raza) +TEST_F(RowToColumnTests, AllTypes) { std::vector> cols; std::vector views; @@ -442,11 +427,115 @@ TEST_F(RowToColumnTests, Raza) cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); + auto new_rows = cudf::convert_to_rows(in); + + for (uint i = 0; i < old_rows.size(); ++i) { + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); + } +} + +TEST_F(RowToColumnTests, AllTypesLarge) +{ + std::vector cols; + std::vector schema{}; + + // 10 columns of each type with 1024 entries + constexpr int num_rows{1024}; + + std::default_random_engine re; + std::uniform_real_distribution rand_double(std::numeric_limits::min(), + std::numeric_limits::max()); + std::uniform_int_distribution rand_int64(std::numeric_limits::min(), + std::numeric_limits::max()); + auto r = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> int64_t { return rand_int64(re); }); + auto d = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) -> double { return rand_double(re); }); + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT8}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT16}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::INT32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + 
schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_width_column_wrapper( + r, r + num_rows) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-2}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); + } + + for (int i = 0; i < 10; ++i) { + cols.push_back( + *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-1}) + .release() + .release()); + schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); + } + + std::vector views(cols.begin(), cols.end()); + cudf::table_view in(views); + + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); auto new_rows = cudf::convert_to_rows(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -470,10 +559,11 @@ TEST_F(RowToColumnTests, Non2Power) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -498,10 +588,11 @@ TEST_F(RowToColumnTests, Big) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -526,10 +617,11 @@ TEST_F(RowToColumnTests, Bigger) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); @@ -554,10 +646,11 @@ TEST_F(RowToColumnTests, Biggest) } cudf::table_view in(views); - auto old_rows = cudf::old_convert_to_rows(in); + auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = cudf::old_convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto old_tbl = + cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 97fe7b4c71e..76b249d591b 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2693,14 +2693,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_table) { +JNIEXPORT jlongArray JNICALL +Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::old_convert_to_rows(*n_input_table); + std::vector> cols = + cudf::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2732,10 +2733,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_column, - jintArray types, - jintArray scale) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidthOptimized( + JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); @@ -2749,7 +2748,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::old_convert_from_rows(list_input, types_vec); + std::unique_ptr result = + cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 1808c7534df..e6cd9a9da32 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,8 +21,6 @@ #include #include -#include -#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -30,10 +28,12 @@ #endif #include +#include #include #include #include #include +#include #include #include #include @@ -51,7 +51,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr 
auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -331,17 +331,9 @@ struct block_info { int buffer_num; __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes, - bool debug_print = false) const { + size_type const *const col_sizes) const { return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); } - __host__ __device__ size_type get_dest_row_size(size_type const *const col_offsets, - size_type const *const col_sizes, - bool debug_print = false) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col] + - util::div_rounding_up_unsafe(num_cols(), 8), - 8); - } __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } @@ -404,16 +396,15 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ group.sync(); auto const blocks_remaining = - std::min((uint)(num_block_infos % NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS), - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); size_t fetch; size_t subset; for (subset = fetch = 0; subset < blocks_remaining; ++subset) { // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch]; + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -429,9 +420,9 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it more - // appealing to copy element-wise from input data into shared matching the end layout and do - // row-based memcopies out. + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. 
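The comment above is the crux of the column-to-row kernel: elements are staged into shared memory already in row-major order so the writes out to global memory can be whole contiguous rows. A minimal host-side sketch of that index mapping, assuming simplified stand-in names (staging_offset, num_block_rows) that are not part of this patch:

#include <cstddef>
#include <cstdio>

// Illustrative only: maps a flat element index within a block to its row-major
// byte offset in the shared staging buffer, mirroring the loop in the kernel.
// num_block_rows, col_offsets and row_size stand in for the kernel's
// num_fetch_rows, col_offsets[] and get_shared_row_size().
std::size_t staging_offset(int el, int num_block_rows, int start_col,
                           const int *col_offsets, int row_size) {
  int relative_col = el / num_block_rows;  // which column of the block
  int relative_row = el % num_block_rows;  // which row of the block
  int relative_col_offset =
      col_offsets[start_col + relative_col] - col_offsets[start_col];
  return static_cast<std::size_t>(relative_row) * row_size + relative_col_offset;
}

int main() {
  // two columns: a 4-byte column at offset 0 and an 8-byte column at offset 8,
  // giving an 8-byte-aligned staging row width of 16 bytes
  const int col_offsets[] = {0, 8};
  const int row_size = 16;
  for (int el = 0; el < 6; ++el)  // 2 columns x 3 rows of elements
    std::printf("element %d -> staging byte %zu\n",
                el, staging_offset(el, 3, 0, col_offsets, row_size));
  return 0;
}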
for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; @@ -445,7 +436,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; - // copy the element to global memory + // copy the element from global memory cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); } @@ -454,10 +445,8 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const dest_row_size = block.get_dest_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; // copy entire rows to final dest @@ -466,7 +455,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const relative_row = absolute_row - block.start_row; auto const output_dest = - output_data[block.buffer_num] + absolute_row * dest_row_size + column_offset; + output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; auto const shared_offset = block_row_size * relative_row; cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, @@ -563,8 +552,8 @@ __global__ void copy_validity_from_columns( input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to make - // the bytes we actually write. + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data @@ -1085,8 +1074,8 @@ static inline int32_t compute_fixed_width_layout(std::vector co } // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it - // in + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in int32_t validity_bytes_needed = (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. @@ -1209,11 +1198,11 @@ std::vector build_block_infos(std::vector const &column_s }; // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges this - // may not turn out to be overly important. For now, we will attempt to build a square window as - // far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we - // want them equal, so height and width are sqrt(shared_mem_size). The trick is that it's in - // bytes, not rows or columns. 
+ // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); int const window_height = std::clamp( util::round_up_safe( @@ -1478,8 +1467,8 @@ std::vector> convert_to_rows(cudf::table_view cons } std::vector> -old_convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { const cudf::size_type num_columns = tbl.num_columns(); std::vector schema; @@ -1656,10 +1645,9 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr old_convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect cudf::column_view child = input.child(); cudf::type_id list_type = child.type().id(); diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 517202f3892..edc2768d4bb 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -25,11 +25,11 @@ namespace cudf { namespace java { -std::vector> -old_convert_to_rows(cudf::table_view const &tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::vector> convert_to_rows_fixed_width_optimized( + cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); std::vector> convert_to_rows(cudf::table_view const &tbl, @@ -37,11 +37,10 @@ convert_to_rows(cudf::table_view const &tbl, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr -old_convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, From 81cbaa60c6ae5dfa40ccaea821de575d7fd19d9e Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 15 Oct 2021 15:20:52 -0700 Subject: [PATCH 58/80] code cleanup and removed comments --- java/src/main/native/src/TableJni.cpp | 10 
---------- 1 file changed, 10 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 76b249d591b..45403f1eb0d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -605,20 +605,16 @@ class native_arrow_ipc_reader_handle final { static jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, std::vector> &extra_columns) { - std::cout << "entering convert_table_for_return\n"; std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::cout << "0\n"; for (int i = 0; i < table_cols; i++) { outcol_handles[i] = reinterpret_cast(ret[i].release()); } - std::cout << "1\n"; for (size_t i = 0; i < extra_columns.size(); i++) { outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); } - std::cout << "exiting convert_table_for_return\n"; return outcol_handles.get_jArray(); } @@ -2717,12 +2713,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { - std::cout << "convert_to_rows\n"; cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::cout << "before convert_to_rows\n"; std::vector> cols = cudf::convert_to_rows(*n_input_table); - std::cout << "after convert_to_rows\n"; int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2763,7 +2756,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e JNI_NULL_CHECK(env, types, "types is null", 0); try { - std::cout << "convert_from_rows\n"; cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_column); cudf::lists_column_view list_input(*input); @@ -2773,9 +2765,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::cout << "before convert_from_rows\n"; std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); - std::cout << "after convert_from_rows\n"; return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); From 58eb43f7e1e23baa68fe6813a4200cb54a0321b2 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Oct 2021 00:53:01 +0000 Subject: [PATCH 59/80] Fixing validity buffer alignment issue for row data --- cpp/src/row_conversion/row_conversion.cu | 142 ++++++++++++-------- cpp/tests/row_conversion/row_conversion.cpp | 63 ++++++--- java/src/main/native/src/row_conversion.cu | 58 +++++--- 3 files changed, 165 insertions(+), 98 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 0457bbf71e4..90bd8b88ef0 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -469,6 +469,7 @@ __global__ void copy_from_columns(const size_type num_rows, // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. 
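The hunks below replace the plain memcpy_async calls with ones that carry an alignment hint: for 2-, 4- and 8-byte elements, cuda::aligned_size_t<N> promises the transfer is naturally aligned so the copy can use wider hardware transactions. A minimal sketch of that dispatch pulled out as a hypothetical helper (the name copy_element_async and the block-scope barrier parameter are assumptions, not code from this patch):

#include <cuda/barrier>
#include <cstdint>

// Illustrative helper: forwards the element size as a compile-time alignment
// hint when it is 2, 4 or 8 bytes, and falls back to an unhinted copy
// otherwise. Intended to be called from inside a kernel that owns the barrier.
__device__ inline void copy_element_async(
    int8_t *dst, int8_t const *src, int size,
    cuda::barrier<cuda::thread_scope_block> &barrier) {
  switch (size) {
    case 2: cuda::memcpy_async(dst, src, cuda::aligned_size_t<2>(size), barrier); break;
    case 4: cuda::memcpy_async(dst, src, cuda::aligned_size_t<4>(size), barrier); break;
    case 8: cuda::memcpy_async(dst, src, cuda::aligned_size_t<8>(size), barrier); break;
    default: cuda::memcpy_async(dst, src, size, barrier); break;
  }
}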
+ auto const shared_buffer_base = shared[fetch % stages_count]; for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; @@ -493,14 +494,36 @@ __global__ void copy_from_columns(const size_type num_rows, printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", fetch, fetch % stages_count, - &shared[fetch % stages_count][shared_offset], + &shared_buffer_base[shared_offset], input_src, *input_src, col_size); // copy the element from global memory - cuda::memcpy_async( - &shared[fetch % stages_count][shared_offset], input_src, col_size, fetch_barrier); + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<2>(col_size), + fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<4>(col_size), + fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], + input_src, + cuda::aligned_size_t<8>(col_size), + fetch_barrier); + break; + default: + cuda::memcpy_async( + &shared_buffer_base[shared_offset], input_src, col_size, fetch_barrier); + break; + } } } @@ -511,15 +534,15 @@ __global__ void copy_from_columns(const size_type num_rows, if (debug_print) printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { auto const relative_row = absolute_row - block.start_row; - auto const output_dest = - output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; if (debug_print) printf("processing row %d\noutput data[%d] is address %p\n", absolute_row, @@ -533,8 +556,10 @@ __global__ void copy_from_columns(const size_type num_rows, block_row_size, absolute_row); - cuda::memcpy_async( - output_dest, &shared[subset % stages_count][shared_offset], block_row_size, subset_barrier); + cuda::memcpy_async(output_dest, + &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), + subset_barrier); } } @@ -641,8 +666,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 31) / 32; - auto const num_sections_y = (num_block_rows + 7) / 8; + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; @@ -690,7 +715,7 @@ __global__ void copy_validity_from_columns(const size_type num_rows, my_section_idx, total_sections); auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 8; + auto const relative_row = section_y * 32; 
auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; @@ -720,15 +745,15 @@ __global__ void copy_validity_from_columns(const size_type num_rows, absolute_row, relative_col, absolute_col); - auto my_byte = - input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + auto my_data = input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] + : std::numeric_limits::max(); if (print_debug) printf( - "thread %d's byte is 0x%x, participation mask is 0x%x for relative row %d(%d real), " + "thread %d's bytes are 0x%x, participation mask is 0x%x for relative row %d(%d real), " "relative col %d(%d absolute)\n", threadIdx.x, - my_byte & 0xFF, + my_data, participation_mask, relative_row, absolute_row, @@ -738,8 +763,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // every thread that is participating in the warp has a byte, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; @@ -750,8 +776,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, "0x%x\n", threadIdx.x, blockIdx.x, - byte_mask, - my_byte & byte_mask, + dw_mask, + my_data & dw_mask, validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, validity_write_offset, validity_data); @@ -804,6 +830,9 @@ __global__ void copy_validity_from_columns(const size_type num_rows, // make sure entire block has finished copy group.sync(); + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { auto const relative_row = row - block.start_row; @@ -835,9 +864,8 @@ __global__ void copy_validity_from_columns(const size_type num_rows, word_index(block.start_col), this_shared_block[validity_data_row_length * relative_row]); } - auto const output_ptr = - output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + auto const output_ptr = output_data_base + row_offsets[row]; + auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); cuda::memcpy_async( output_ptr, @@ -970,11 +998,20 @@ static __device__ void fetch_blocks_for_row_to_column( row += blockDim.x) { auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; if (debug_print) - printf("fetching block %lu to shared chunk %lu. %p <- %p\n", - fetch_index, - fetch_index % max_resident_blocks, - &shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset]); + printf( + "%d - fetching block %lu to shared chunk %lu. 
%p(shared[%d %% %d][%d]) <- %p(row %d, row " + "offset %d starting col offset %d)\n", + threadIdx.x, + fetch_index, + fetch_index % max_resident_blocks, + &shared[fetch_index % max_resident_blocks][shared_offset], + (int)fetch_index, + max_resident_blocks, + shared_offset, + &input_data[row_offsets[row] + starting_col_offset], + row, + row_offsets[row], + starting_col_offset); // copy the main cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], &input_data[row_offsets[row] + starting_col_offset], @@ -1021,7 +1058,7 @@ __global__ void copy_to_columns(const size_type num_rows, // to speed up some of the random access memory we do, we copy col_sizes and col_offsets // to shared memory for each of the blocks that we work on - /*constexpr*/ bool debug_print = false; // threadIdx.x == 0 && blockIdx.x == 0; + constexpr bool debug_print = false; // threadIdx.x == 2 && blockIdx.x == 0; constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; @@ -1094,12 +1131,12 @@ __global__ void copy_to_columns(const size_type num_rows, auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); subset_barrier.arrive_and_wait(); auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf("%d-%d reading block %lu at address %p\n", threadIdx.x, blockIdx.x, @@ -1159,19 +1196,19 @@ __global__ void copy_to_columns(const size_type num_rows, if (debug_print) { printf( - "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " - "shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", - relative_col, - relative_row, - absolute_col, - absolute_row, - shared_memory_row_offset, - shared_memory_offset, - column_size, - shmem_src, - dst/*, - *reinterpret_cast(shmem_src)*/); + "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " + "shared_mmeory_row_offset: %d, shared_memory_offset: %d," + " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", + relative_col, + relative_row, + absolute_col, + absolute_row, + shared_memory_row_offset, + shared_memory_offset, + column_size, + shmem_src, + dst/*, + *reinterpret_cast(shmem_src)*/); printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); } if (debug_print && absolute_col == 0 && absolute_row == 51) { @@ -1185,7 +1222,7 @@ __global__ void copy_to_columns(const size_type num_rows, cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); } group.sync(); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) + if (debug_print) printf( "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); } @@ -1224,9 +1261,7 @@ __global__ void copy_validity_to_columns(const size_type num_rows, int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; - // bool print_debug = false; - // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; + constexpr bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; if (print_debug) 
{ printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); printf("%d %d - block infos are at %p and my index is %d\n", @@ -1246,10 +1281,6 @@ __global__ void copy_validity_to_columns(const size_type num_rows, output_nm, row_offsets, block_infos); - /* printf("Row Offsets:\n"); - for (int i=0; i double { return rand_double(re); }); + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); + auto most_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); + auto few_valid = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); + for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::INT8}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::INT16}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + if (i < 5) { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); + } else { + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) + .release() + .release()); + } schema.push_back(cudf::data_type{cudf::type_id::INT32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(d, d + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper(r, r + num_rows).release().release()); + cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); } for (int i = 0; i < 10; ++i) { cols.push_back( *cudf::test::fixed_width_column_wrapper( - r, r + num_rows) + r, r + num_rows, all_valid) .release() .release()); schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); @@ -505,25 +524,25 @@ TEST_F(RowToColumnTests, AllTypesLarge) for (int i = 0; i < 10; ++i) { cols.push_back( *cudf::test::fixed_width_column_wrapper( - r, r + num_rows) + r, r + num_rows, most_valid) .release() .release()); schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-2}) - .release() - .release()); + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, all_valid, 
numeric::scale_type{-2}) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); } for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_point_column_wrapper(r, r + num_rows, numeric::scale_type{-1}) - .release() - .release()); + cols.push_back(*cudf::test::fixed_point_column_wrapper( + r, r + num_rows, most_valid, numeric::scale_type{-1}) + .release() + .release()); schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); } diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index e6cd9a9da32..a67589fbaec 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -424,6 +424,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. + auto const shared_buffer_base = shared[fetch % stages_count]; for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; @@ -437,8 +438,24 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto const input_src = input_data[absolute_col] + col_size * absolute_row; // copy the element from global memory - cuda::memcpy_async(&shared[fetch % stages_count][shared_offset], input_src, col_size, - fetch_barrier); + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<2>(col_size), fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<4>(col_size), fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<8>(col_size), fetch_barrier); + break; + default: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, + fetch_barrier); + break; + } } } @@ -448,18 +465,17 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = - output_data[block.buffer_num] + row_offsets[absolute_row] + column_offset; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; auto const shared_offset = block_row_size * relative_row; - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], block_row_size, - subset_barrier); + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), subset_barrier); } } @@ -523,8 +539,8 @@ __global__ void copy_validity_from_columns( auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 31) / 32; - auto const num_sections_y = (num_block_rows + 7) / 8; + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const 
num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; @@ -536,26 +552,27 @@ __global__ void copy_validity_from_columns( // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 8; + auto const relative_row = section_y * 32; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; auto const cols_left = num_columns - absolute_col; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { - auto my_byte = - input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] : 0xFF; + auto my_data = input_nm[absolute_col] != nullptr ? + input_nm[absolute_col][absolute_row / 32] : + std::numeric_limits::max(); // every thread that is participating in the warp has a byte, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_row + i < num_rows; ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); + bitmask_type dw_mask = 1; + for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { + auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; @@ -585,11 +602,13 @@ __global__ void copy_validity_from_columns( // make sure entire block has finished copy group.sync(); + auto const output_data_base = + output_data[block.buffer_num] + validity_offset + block.start_col / 8; + // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { auto const relative_row = row - block.start_row; - auto const output_ptr = - output_data[block.buffer_num] + row_offsets[row] + validity_offset + block.start_col / 8; + auto const output_ptr = output_data_base + row_offsets[row]; auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); cuda::memcpy_async( @@ -917,8 +936,6 @@ __global__ void copy_validity_to_columns( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; - auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( output_nm[col] + word_index(block_start_row), @@ -1111,7 +1128,7 @@ static size_type compute_column_information(iterator begin, iterator end, fixed_width_size_per_row += col_size; } - auto validity_offset = detail::align_offset(fixed_width_size_per_row, 4); + auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); return fixed_width_size_per_row; @@ -1233,7 +1250,6 @@ std::vector 
build_block_infos(std::vector const &column_s if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); row_size = From 06837f061795c6bc09b530e42d6dd14cbcf1af5f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Oct 2021 18:02:07 +0000 Subject: [PATCH 60/80] Cleaning up code for PR --- cpp/src/row_conversion/row_conversion.cu | 4132 ++++++++------------ java/src/main/native/src/row_conversion.cu | 237 +- 2 files changed, 1740 insertions(+), 2629 deletions(-) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu index 90bd8b88ef0..c068a2c0b76 100644 --- a/cpp/src/row_conversion/row_conversion.cu +++ b/cpp/src/row_conversion/row_conversion.cu @@ -14,2487 +14,1653 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include "cudf/detail/iterator.cuh" -#include "cudf/lists/lists_column_device_view.cuh" - -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; -constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; -#endif - -using cudf::detail::make_device_uvector_async; -using rmm::device_uvector; -namespace cudf { - -namespace detail { - -static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) -{ - return (offset + alignment - 1) & ~(alignment - 1); -} - -__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* input_offset_in_row, - const cudf::size_type* num_bytes, - int8_t** output_data, - cudf::bitmask_type** output_nm, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - const int64_t* long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. - uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - const int8_t* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t* col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t* short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t* int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t* long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type* nm = output_nm[col_index]; - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } -} - -__global__ void copy_from_fixed_width_columns(const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* output_offset_in_row, - const cudf::size_type* num_bytes, - const int8_t** input_data, - const cudf::bitmask_type** input_nm, - int8_t* output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t* row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. 
- if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t* col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t* short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t* int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t* long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t* valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t* long_shared = reinterpret_cast(shared_data); - int64_t* long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int buffer_num; - - __host__ __device__ size_type get_shared_row_size(size_type const* const col_offsets, - size_type const* const col_sizes, - bool 
debug_print = false) const - { - if (debug_print) - printf("col_offsets[%d]: %p + col_sizes[%d]: %p - col_offsets[%d]: %p\n%d + %d - %d\n", - end_col, - &col_offsets[end_col], - end_col, - &col_sizes[end_col], - start_col, - &col_offsets[start_col], - col_offsets[end_col], - col_sizes[end_col], - col_offsets[start_col]); - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); - } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } - - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } -}; - -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. -struct row_batch { - size_type num_bytes; - size_type row_count; -}; - -/** - * @brief copy data from cudf columns into x format, which is row-based - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param input_data pointer to raw table data - * @param input_nm pointer to validity data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param row_offsets offset to a specific row in the input data - * @param output_data pointer to output data - * - */ -__global__ void copy_from_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type num_block_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - const block_info* block_infos, - const size_type* row_offsets, - int8_t** output_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
- - constexpr bool debug_print = false; // blockIdx.x == 0 && threadIdx.x == 1; - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("col sizes at %p, col offsets at %p, and row offsets at %p\n", - col_sizes, - col_offsets, - row_offsets); - printf("block infos are at %p and my index is %d\n", block_infos, blockIdx.x); - printf("output data to %p\n", output_data[block_infos[blockIdx.x].buffer_num]); - printf("shared memory pointers are %p and %p\n", shared[0], shared[1]); - printf("shared_memory ends at %p\n", &shared_data[shmem_used_per_block * 2]); - printf("group is %d threads\n", group.size()); - } - // else { return; } - - auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - if (debug_print) - printf("fetching block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch); - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; - if (debug_print) - printf("block %lu rows %d-%d and cols %d-%d\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch, - fetch_block.start_row, - fetch_block.end_row, - fetch_block.start_col, - fetch_block.end_col); - - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto& fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - - // to do the copy we need to do n column copies followed by m element copies OR - // we have to do m element copies followed by r row copies. When going from column - // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it - // more appealing to copy element-wise from input data into shared matching the end layout - // and do row-based memcopies out. 
- - auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { - auto const relative_col = el / num_fetch_rows; - auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; - if (debug_print) - printf("row %d(%d), col %d(%d), %d fetch rows, element %d\n", - relative_row, - absolute_row, - relative_col, - absolute_col, - num_fetch_rows, - el); - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; - auto const input_src = input_data[absolute_col] + col_size * absolute_row; - - if (debug_print) - printf("block %lu to shared chunk %lu. %p <- %p(0x%x) - %d bytes\n", - fetch, - fetch % stages_count, - &shared_buffer_base[shared_offset], - input_src, - *input_src, - col_size); - - // copy the element from global memory - switch (col_size) { - case 2: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<2>(col_size), - fetch_barrier); - break; - case 4: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<4>(col_size), - fetch_barrier); - break; - case 8: - cuda::memcpy_async(&shared_buffer_base[shared_offset], - input_src, - cuda::aligned_size_t<8>(col_size), - fetch_barrier); - break; - default: - cuda::memcpy_async( - &shared_buffer_base[shared_offset], input_src, col_size, fetch_barrier); - break; - } - } - } - - auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; - if (debug_print) - printf("reading block %lu\n", blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset); - - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; - - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - if (debug_print) - printf("processing row %d\noutput data[%d] is address %p\n", - absolute_row, - absolute_row, - output_dest); - auto const shared_offset = block_row_size * relative_row; - if (debug_print) - printf("memcpy %p <- %p - %d bytes which is row %d\n", - output_dest, - &shared[subset % stages_count][shared_offset], - block_row_size, - absolute_row); - - cuda::memcpy_async(output_dest, - &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), - subset_barrier); - } - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data pointer to 
output data, partitioned by data size - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ -__global__ void copy_validity_from_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - int8_t** output_data, - const size_type validity_offset, - const block_info* block_infos, - const size_type num_block_infos, - const bitmask_type** input_nm) -{ - extern __shared__ int8_t shared_data[]; - int8_t* shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - constexpr bool print_debug = false; // threadIdx.x==0 && blockIdx.x == 0; - // if (blockIdx.x != 3 || threadIdx.x / 32 != 0) return; - if (print_debug) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d %d - block infos are at %p and my index is %d\n", - threadIdx.x, - blockIdx.x, - block_infos, - blockIdx.x); - printf("%d %d - input nm is %p, input_nm[0] is at %p\n", - threadIdx.x, - blockIdx.x, - input_nm, - input_nm[0]); - printf("shared memory is %p to %p\n", shared_data, shared_data + shmem_used_per_block * 2); - printf("block infos at %p and this is index %d\n", - &block_infos, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + 0); - /* printf("Row Offsets:\n"); - for (int i=0; i - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - if (print_debug) - printf("%d: waiting at barrier %d\n", - threadIdx.x, - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED); - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); - if (print_debug) printf("past barrier...\n"); - } - int8_t* this_shared_block = shared_blocks[validity_block % 2]; - if (print_debug) printf("top of loop for validity block %d\n", validity_block); - if (print_debug) - printf("reading validity block info %d at %p\n", - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - &block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]); - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); - auto const total_sections = num_sections_x * num_sections_y; - - if (print_debug) { - printf("%d %d - block %d has %d cols, %d rows, %d row length, and %d total sections\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - num_block_cols, - num_block_rows, - validity_data_row_length, - total_sections); - } - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = 
std::max(1u, blockDim.x / detail::warp_size); - - if (print_debug) - printf( - "%d %d - my warp is %d, %d total sections(%d x, %d y), %d warps per block, " - "blockDim.x=%d, " - "warp size " - "%d\n", - threadIdx.x, - blockIdx.x, - warp_id, - total_sections, - num_sections_x, - num_sections_y, - warps_per_block, - blockDim.x, - detail::warp_size); - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - - if (print_debug) - printf("working on section %d,%d - %d of %d...\n", - section_x, - section_y, - my_section_idx, - total_sections); - auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; - - if (print_debug) printf("pre ballot sync...\n"); - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); - - if (print_debug) - printf( - "participation mask is 0x%x for relative row %d(%d real), relative col %d(%d " - "absolute)\n", - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - - if (absolute_col < num_columns) { - if (print_debug) - printf( - "thread %d's byte is at %p, participation mask is 0x%x for relative row %d(%d real), " - "relative col %d(%d absolute)\n", - threadIdx.x, - &input_nm[absolute_col][absolute_row / 32], - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - auto my_data = input_nm[absolute_col] != nullptr ? input_nm[absolute_col][absolute_row / 32] - : std::numeric_limits::max(); - - if (print_debug) - printf( - "thread %d's bytes are 0x%x, participation mask is 0x%x for relative row %d(%d real), " - "relative col %d(%d absolute)\n", - threadIdx.x, - my_data, - participation_mask, - relative_row, - absolute_row, - relative_col, - absolute_col); - - // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to - // make the bytes we actually write. 
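A condensed sketch of that warp-level shuffle: each active lane holds the 32-row validity word of one column, and 32 rounds of __ballot_sync rebuild the same bits row-major, one 32-column word per row. The free function and parameter names are illustrative, not the kernel's code.

#include <cstdint>

// Lane `lane` owns one column of a 32x32 bit tile: my_col_word holds that
// column's validity for 32 consecutive rows. Round i ballots row i's bit from
// every participating lane, producing that row laid out across columns.
__device__ void transpose_validity_tile(uint32_t participation_mask,
                                        uint32_t my_col_word,
                                        int rows_in_tile,
                                        uint32_t row_words[32])
{
  uint32_t row_bit = 1;
  for (int i = 0; i < rows_in_tile; ++i, row_bit <<= 1) {
    // bit `lane` of the ballot is column `lane`'s validity for row i
    uint32_t const row_word = __ballot_sync(participation_mask, my_col_word & row_bit);
    if (threadIdx.x % 32 == 0) { row_words[i] = row_word; }  // lead lane keeps the row word
  }
}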
- bitmask_type dw_mask = 1; - for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { - if (print_debug) - printf( - "%d %d - byte_mask is 0x%x, masked_byte is 0x%x, shared_data_block[%d][%d] = " - "0x%x\n", - threadIdx.x, - blockIdx.x, - dw_mask, - my_data & dw_mask, - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, - validity_write_offset, - validity_data); - if (cols_left <= 8) { - // write byte - if (print_debug) - printf("%d %d - writing single byte to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - if (print_debug) - printf("%d %d - writing two bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - if (print_debug) - printf("%d %d - writing three bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - if (print_debug) - printf("%d %d - writing 4 bytes to shared offset 0x%x which is %p...\n", - threadIdx.x, - blockIdx.x, - validity_write_offset, - &this_shared_block[validity_write_offset]); - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; - - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - if (print_debug) { - printf( - "base output data is %p, row offset is 0x%x, validity offset into row is 0x%x, word " - "index of block start is 0x%x\n", - output_data[block.buffer_num], - row_offsets[row], - validity_offset, - word_index(block.start_col)); - printf( - "%d %d - row %d/%d/%d col %d-%d - %p = shared_data_block[%d][%d] which is %p - %d " - "bytes\n - %p <- 0x%x\n", - threadIdx.x, - blockIdx.x, - block.start_row, - row, - block.end_row, - block.start_col, - block.end_col, - output_data[block.buffer_num] + row_offsets[row] + validity_offset + - (word_index(block.start_col)), - validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED, - validity_data_row_length * relative_row, - &this_shared_block[validity_data_row_length * relative_row], - util::div_rounding_up_unsafe(num_block_cols, 8), - output_data[block.buffer_num] + row_offsets[row] + validity_offset + - word_index(block.start_col), - this_shared_block[validity_data_row_length * relative_row]); - } - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - 
cuda::memcpy_async( - output_ptr, - &this_shared_block[validity_data_row_length * relative_row], - num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } -} - -static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, - size_t col_offset_size, - int const num_cols) -{ - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; -} - -/** - * @brief ensure `read_ahead` buffer blocks are fetched - * - * @param fetch_index internal state passed into the function - * @param processing_index index where processing is occuring - * @param read_ahead_count how many blocks to read ahead - * @param max_resident_blocks how many blocks can be loaded at once - * @param total_blocks total number of blocks overall - * @param block_infos pointer to the block infos - * @param col_sizes pointer to column size information - * @param col_offsets pointer to the table's column offsets - * @param row_offsets pointer to offsets for each row in the table - * @param input_data pointer to the input data - * @param shared pointer to shared memory - * @param group thread group participating in the fetch - * @param block_barrier barriers used for each block - * @param debug_print - * @return - */ -static __device__ void fetch_blocks_for_row_to_column( - size_t& fetch_index, - size_t const processing_index, - int const read_ahead_count, - int const max_resident_blocks, - int const total_blocks, - block_info const* const block_infos, - size_type const* const col_sizes, - size_type const* const col_offsets, - size_type const* const row_offsets, - int8_t const* const input_data, - int8_t* shared[], - cooperative_groups::thread_block const group, - cuda::barrier* block_barrier, - bool debug_print) -{ - for (; fetch_index < static_cast(total_blocks) && - fetch_index < (processing_index + read_ahead_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - if (debug_print) - printf( - "fetching block %lu of %d for start col %d, end col %d. 
Starting col offset is %p, " - "ending " - "offset %p\n", - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index, - total_blocks, - fetch_block.start_col, - fetch_block.end_col, - &col_offsets[fetch_block.start_col], - &col_offsets[fetch_block.end_col]); - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); - auto& fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - - auto shared_row_offset = 0; - // copy the data for column sizes - if (debug_print) - printf("%d: col sizes memcpy_async(group, %p, %p, %d, barrier);\n", - threadIdx.x, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_size_bytes); - if (debug_print && group.thread_rank() == 0 && blockIdx.x == 0) - printf("%d-%d fetching to %p with barrier %p\n", - threadIdx.x, - blockIdx.x, - shared[fetch_index % max_resident_blocks], - &fetch_barrier); - cuda::memcpy_async(group, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_sizes[fetch_block.start_col], - col_size_bytes, - fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - if (debug_print) - printf("%d: offsets memcpy_async(group, %p, %p, %d, barrier);\n", - threadIdx.x, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_offset_bytes); - cuda::memcpy_async(group, - &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], - col_offset_bytes, - fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; - row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - if (debug_print) - printf( - "%d - fetching block %lu to shared chunk %lu. 
%p(shared[%d %% %d][%d]) <- %p(row %d, row " - "offset %d starting col offset %d)\n", - threadIdx.x, - fetch_index, - fetch_index % max_resident_blocks, - &shared[fetch_index % max_resident_blocks][shared_offset], - (int)fetch_index, - max_resident_blocks, - shared_offset, - &input_data[row_offsets[row] + starting_col_offset], - row, - row_offsets[row], - starting_col_offset); - // copy the main - cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, - fetch_barrier); - } - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ -__global__ void copy_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - int8_t** output_data, - const size_type* _col_sizes, - const size_type* _col_offsets, - const block_info* block_infos, - const size_type num_block_infos, - const int8_t* input_data) -{ - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. 
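Each staged block in this row-to-column path is laid out in shared memory as the per-column sizes, then the per-column offsets, then the row data padded up to an 8-byte boundary. A host-side sketch of how those offsets fall out (the helper names and the 4-byte size_type are assumptions made for the example):

#include <cstddef>
#include <cstdint>

// Round offset up to the next multiple of a power-of-two alignment, as the
// kernel's align_offset() does.
constexpr int32_t align_up(int32_t offset, std::size_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

// Shared-memory layout of one staged block:
// [col sizes][col offsets][pad to 8][row 0][row 1]...
struct staged_block_layout {
  std::size_t col_sizes_bytes;    // num_cols * sizeof(size_type)
  std::size_t col_offsets_bytes;  // num_cols * sizeof(size_type)
  std::size_t row_data_offset;    // where row 0 starts, 8-byte aligned
};

staged_block_layout make_layout(int num_cols, std::size_t size_type_bytes = 4)
{
  std::size_t const sizes_bytes   = num_cols * size_type_bytes;
  std::size_t const offsets_bytes = num_cols * size_type_bytes;
  auto const admin = static_cast<int32_t>(sizes_bytes + offsets_bytes);
  return {sizes_bytes, offsets_bytes, static_cast<std::size_t>(align_up(admin, 8))};
}
// e.g. 10 columns of 4-byte size_type: 40 + 40 = 80 admin bytes, already a
// multiple of 8, so the block's row data starts at byte 80 of its shared buffer.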
- - // to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on - - constexpr bool debug_print = false; // threadIdx.x == 2 && blockIdx.x == 0; - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t* shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - if (debug_print) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf( - "%d block infos are at %p and my index is %d\n", num_block_infos, block_infos, blockIdx.x); - /* printf("Row Offsets:\n"); - for (int i=0; i block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); - - auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), - col_offset_size = sizeof(decltype(*_col_offsets))]( - int const num_cols, - int const num_rows) -> std::tuple { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; - }; - - if (debug_print) - printf("%d blocks remaining -> %d block infos, %d block index\n", - blocks_remaining, - num_block_infos, - blockIdx.x); - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - fetch_blocks_for_row_to_column(fetch, - subset, - stages_count, - stages_count, - blocks_remaining, - block_infos, - _col_sizes, - _col_offsets, - row_offsets, - input_data, - shared, - group, - block_barrier, - debug_print); - - auto& subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - // ensure our data is ready - if (debug_print) - printf("%d-%d waiting at barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; - if (debug_print) - printf("%d-%d reading block %lu at address %p\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset, - shared[subset % stages_count]); - - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); - // auto shared_row_offsets = shared[subset]; - auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); - - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes, debug_print); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. 
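The element loop that follows walks one flattened index per thread stride so any thread count divides the work evenly; with the column varying fastest, neighbouring threads touch neighbouring columns of the same staged row. A small sketch of the index arithmetic (plain ints and illustrative names):

// thread t handles flat elements t, t + blockDim.x, t + 2 * blockDim.x, ...
struct element_pos {
  int relative_row;
  int relative_col;
};

inline element_pos decompose(int flat_index, int cols_in_block)
{
  return {flat_index / cols_in_block, flat_index % cols_in_block};
}

// source offset inside the staged block: skip the admin prefix, jump to the
// row, then add the column's offset relative to the block's first column
inline int shared_source_offset(int admin_prefix, int relative_row, int block_row_size,
                                int col_offset, int first_col_offset)
{
  return admin_prefix + relative_row * block_row_size + (col_offset - first_col_offset);
}
// the destination is simply output_data[absolute_col] + absolute_row * column_size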
- // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - if (debug_print) - printf("copying for row %d(%d absolute) col %d(%d absolute)\n", - relative_row, - absolute_row, - relative_col, - absolute_col); - - auto const shared_memory_row_offset = block_row_size * relative_row; - if (debug_print) - printf("shared_col_offsets is %p and relative col is %d, making me access %p\n", - shared_col_offsets, - relative_col, - &shared_col_offsets[relative_col]); - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - if (debug_print) - printf("shared_col_sizes is %p and relative col is %d, making me access %p\n", - shared_col_sizes, - relative_col, - &shared_col_sizes[relative_col]); - auto const column_size = shared_col_sizes[relative_col]; - - int8_t* shmem_src = &shared[subset % stages_count][shared_memory_offset]; - int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; - - if (debug_print) { - printf( - "relative_col: %d, relative_row: %d, absolute_col: %d, absolute_row: %d, " - "shared_mmeory_row_offset: %d, shared_memory_offset: %d," - " column_size: %d, shmem_src: %p, dst: %p\n",//, uint32 is %u\n", - relative_col, - relative_row, - absolute_col, - absolute_row, - shared_memory_row_offset, - shared_memory_offset, - column_size, - shmem_src, - dst/*, - *reinterpret_cast(shmem_src)*/); - printf("memcpy_async(%p, %p, %d, subset_barrier);\n", dst, shmem_src, column_size); - } - if (debug_print && absolute_col == 0 && absolute_row == 51) { - printf("col0row51(%d bytes) = %p - 0x", column_size, shmem_src); - for (int i = 0; i < column_size; ++i) { - printf("%x ", shmem_src[i]); - } - printf("\n"); - } - - cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); - } - group.sync(); - if (debug_print) - printf( - "%d-%d copy to main memory with barrier %p\n", threadIdx.x, blockIdx.x, &subset_barrier); - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } -} - -/** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ -__global__ void copy_validity_to_columns(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_block, - const size_type* row_offsets, - cudf::bitmask_type** output_nm, - const size_type validity_offset, - const block_info* block_infos, - const size_type num_block_infos, - const int8_t* input_data) -{ - extern __shared__ int8_t shared_data[]; - int8_t* 
shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - constexpr bool print_debug = false; // threadIdx.x == 0 && blockIdx.x == 0; - if (print_debug) { - printf("%d %d - %d rows, %d columns\n", threadIdx.x, blockIdx.x, num_rows, num_columns); - printf("%d %d - block infos are at %p and my index is %d\n", - threadIdx.x, - blockIdx.x, - block_infos, - blockIdx.x); - printf( - "%d %d - Shared memory starts at %p and ends at %p, input data is %p, output data is %p, " - "row " - "offsets are %p, block infos at %p\n", - threadIdx.x, - blockIdx.x, - shared_data, - shared_data + shmem_used_per_block, - input_data, - output_nm, - row_offsets, - block_infos); - } - // else { return; } - - // per conversation with DaveB - // each thread of warp reads a single byte of validity - so we read 32 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. - // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - if (print_debug) printf("%d blocks with %d in group\n", blocks_remaining, group.size()); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { - shared_block_barriers[validity_index].arrive_and_wait(); - } - int8_t* this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - - if (print_debug) { - printf("%d %d - block %d has %d cols, %d rows, and %d total sections\n", - threadIdx.x, - blockIdx.x, - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block, - num_block_cols, - num_block_rows, - total_sections); - } - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - if (print_debug) - printf( - "%d %d - my warp is %d, %d total sections, %d warps per block, blockDim.x=%d, warp side " - "%d\n", - threadIdx.x, - blockIdx.x, - warp_id, - total_sections, - warps_per_block, - blockDim.x, - detail::warp_size); - // the block is divided into sections. A warp operates on a section at a time. 
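For this row-to-column validity pass a section is 8 columns wide and 32 rows tall: each lane of the warp reads one row's validity byte and the ballots below rebuild it column-major, one word per 32 rows. A worked host-side example of the bookkeeping (numbers are illustrative):

#include <cstdio>

// ceil(a / b) for positive ints, as util::div_rounding_up_unsafe does
constexpr int div_up(int a, int b) { return (a + b - 1) / b; }

int main()
{
  int const num_block_cols = 20;   // example block: 20 columns x 100 rows
  int const num_block_rows = 100;

  int const num_sections_x = div_up(num_block_cols, 8);        // 3 sections across
  int const num_sections_y = div_up(num_block_rows, 32);       // 4 sections down
  int const total_sections = num_sections_x * num_sections_y;  // 12 warp-sized tasks

  // one 32-bit validity word per 32 rows, stored per column in shared memory
  int const validity_data_col_length = num_sections_y * 4;     // 16 bytes per column

  std::printf("%d sections, %d bytes of shared validity per column\n",
              total_sections, validity_data_col_length);
  return 0;
}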
- for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - - auto const relative_col = section_x * 8; - auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; - - /* if (print_debug) - printf("%d-%d: si: %d nsx: %d nsy: %d sx: %d sy: %d ar: %d nr: %d rc: %d rr: %d\n", - threadIdx.x, - blockIdx.x, - my_section_idx, - num_sections_x, - num_sections_y, - section_x, - section_y, - absolute_row, - num_rows, - relative_col, - relative_row);*/ - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; - - // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make - // the bytes we actually write. - for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; - ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / 8; - - if (print_debug) - printf( - "%d - Writing validity data for column %d, row %d 0x%x to shared memory location " - "%d(%d * (%d + %d) + %d / 8)\n", - threadIdx.x, - absolute_col + i, - absolute_row, - validity_data, - validity_write_offset, - validity_data_col_length, - relative_col, - i, - relative_row); - - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - auto const words_to_copy = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const starting_address = output_nm[col] + word_index(block_start_row); - - if (print_debug) - printf("%d %d - col %d memcpy_async(%p(offset %d), %p, %d, subset_barrier); - 0x%x\n", - threadIdx.x, - blockIdx.x, - col, - starting_address, - word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - words_to_copy * 4, - this_shared_block[validity_data_col_length * relative_col]); - cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last 
blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED - ? NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED - : blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } -} - -#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -/** - * Calculate the dimensions of the kernel for fixed width only columns. - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ -static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, - dim3& blocks, - dim3& threads) -{ - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) { y_block_size = 32; } - int x_possible_block_size = 1024 / y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size / size_per_row; - // If we don't have enough shared memory there is no point in having more threads - // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; - // Make sure that the x dimension is a multiple of 32 this not only helps - // coalesce memory access it also lets us do a ballot sync for validity to write - // the data back out the warp level. If x is a multiple of 32 then each thread in the y - // dimension is associated with one or more warps, that should correspond to the validity - // words directly. - int block_size = (max_block_size / 32) * 32; - CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. 
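A worked example of that sizing logic, following the arithmetic above with illustrative numbers (the clamp to at least one block is omitted for brevity):

#include <cstdio>

int main()
{
  int const num_columns  = 10;
  int const num_rows     = 1 << 20;
  int const size_per_row = 40;  // padded row size in bytes

  int y_block_size = (num_columns + 3) / 4;          // 3 threads in y, capped at 32
  if (y_block_size > 32) { y_block_size = 32; }
  int const x_possible = 1024 / y_block_size;        // 341 threads available in x
  int const max_shared = 48 * 1024;                  // default shared memory budget
  int max_block_size   = max_shared / size_per_row;  // 1228 rows fit in shared memory
  if (max_block_size > x_possible) { max_block_size = x_possible; }
  int const block_size = (max_block_size / 32) * 32; // 320: keep x a warp multiple

  int num_blocks = (num_rows + block_size - 1) / block_size;
  if (num_blocks > 10240) { num_blocks = 10240; }    // cap to limit launch overhead

  std::printf("blocks=(%d,1,1) threads=(%d,%d,1) shmem=%d bytes\n",
              num_blocks, block_size, y_block_size, size_per_row * block_size);
  return 0;
}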
- num_blocks = 10240; - } - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; - threads.x = block_size; - threads.y = y_block_size; - threads.z = 1; - return size_per_row * block_size; -} - -/** - * When converting to rows it is possible that the size of the table was too big to fit - * in a single column. This creates an output column for a subset of the rows in a table - * going from start row and containing the next num_rows. Most of the parameters passed - * into this function are common between runs and should be calculated once. - */ -static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type size_per_row, - rmm::device_uvector& column_start, - rmm::device_uvector& column_size, - rmm::device_uvector& input_data, - rmm::device_uvector& input_nm, - const cudf::scalar& zero, - const cudf::scalar& scalar_size_per_row, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - - std::unique_ptr data = - cudf::make_numeric_column(cudf::data_type(cudf::type_id::INT8), - static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, - stream, - mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_from_fixed_width_columns<<>>( - start_row, - num_rows, - num_columns, - size_per_row, - column_start.data(), - column_size.data(), - input_data.data(), - input_nm.data(), - data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr); -} - -static cudf::data_type get_data_type(const cudf::column_view& v) { return v.type(); } - -static inline bool are_all_fixed_width(std::vector const& schema) -{ - return std::all_of( - schema.begin(), schema.end(), [](const cudf::data_type& t) { return cudf::is_fixed_width(t); }); -} - -/** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ -static inline int32_t compute_fixed_width_layout(std::vector const& schema, - std::vector& column_start, - std::vector& column_size) -{ - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. 
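A worked example of that row layout for a small fixed-width schema, mirroring the computation that follows (the schema and sizes are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int32_t align_up(int32_t offset, int32_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main()
{
  // example schema: INT8, INT32, INT16, INT64 (element sizes in bytes)
  std::vector<int32_t> const sizes{1, 4, 2, 8};
  std::vector<int32_t> starts;

  int32_t at = 0;
  for (auto s : sizes) {   // each column is aligned to its own element size
    at = align_up(at, s);
    starts.push_back(at);  // -> 0, 4, 8, 16
    at += s;
  }
  at += (static_cast<int32_t>(sizes.size()) + 7) / 8;  // 1 validity byte for 4 columns -> 25
  int32_t const row_size = align_up(at, 8);            // pad the row to 64 bits -> 32

  std::printf("row is %d bytes; column starts: %d %d %d %d\n",
              row_size, starts[0], starts[1], starts[2], starts[3]);
  return 0;
}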
- int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); - // validity comes at the end and is byte aligned so we can pack more in. - at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) -} - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -template -static size_type compute_column_information(iterator begin, - iterator end, - std::vector& column_starts, - std::vector& column_sizes) //, -// std::function nested_type_cb) -{ - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = fixed_width_size_per_row; - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; -} - -//#define DEBUG - -std::vector build_validity_block_infos( - size_type const& num_columns, - size_type const& num_rows, - size_type const& shmem_limit_per_block, - std::vector const& row_batches) -{ - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(), - 8); - // we fit as much as we can given the column stride - // note that an element in the table takes just 1 bit, but a row with a single - // element still takes 8 bytes! 
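A sketch of how that constraint shapes the validity windows: fix the column stride first, in multiples of 8 so bits pack into whole bytes, then pack rows until the byte-padded rows fill the shared-memory budget. This is a simplified host-side rendering of the calculation above, with illustrative inputs:

#include <algorithm>
#include <cmath>
#include <cstdio>

constexpr int align_up(int offset, int alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main()
{
  int const shmem_limit = 48 * 1024;
  int const num_columns = 300;
  int const num_rows    = 1 << 20;

  int const desired = static_cast<int>(std::sqrt(static_cast<double>(shmem_limit)));  // ~221
  int const column_stride = desired > num_columns ? std::min(8, num_columns)
                                                  : (desired / 8) * 8;       // 216 columns
  // a window row needs ceil(stride / 8) validity bytes, padded to 8
  int const bytes_per_row = align_up((column_stride + 7) / 8, 8);             // 32 bytes
  int const row_stride    = std::min(num_rows, shmem_limit / bytes_per_row);  // 1536 rows

  std::printf("validity window: %d cols x %d rows (%d bytes per row)\n",
              column_stride, row_stride, bytes_per_row);
  return 0;
}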
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); - - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int row = 0; - while (row < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(row_stride, rows_left_in_batch); - - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; - } - } - - return validity_block_infos; -} - -std::vector build_block_infos(std::vector const& column_sizes, - std::vector const& column_starts, - std::vector const& row_batches, - size_type const total_number_of_rows, - size_type const& shmem_limit_per_block) -{ - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, - current_window_start_row, - end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges - // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = - // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The - // trick is that it's in bytes, not rows or columns. 
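A compact illustration of that square-window heuristic: take the square root of the shared-memory budget as a byte length per side, convert the vertical side into rows of an assumed row size, and round to a warp multiple. This is deliberately simplified; the real calculation below also folds in the batch row counts and the admin-data overhead.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
  int const shmem_limit  = 48 * 1024;
  int const row_size_est = 8;  // assumed bytes per row for the columns in this window

  int const side_bytes = static_cast<int>(std::sqrt(static_cast<double>(shmem_limit)));  // ~221
  int window_height    = std::max(1, side_bytes / row_size_est);                         // ~27 rows
  window_height        = ((window_height + 31) / 32) * 32;                               // round up to 32

  std::printf("aim for windows about %d bytes wide and %d rows tall\n",
              side_bytes, window_height);
  return 0;
}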
- size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, - row_batches[0].row_count); -#if defined(DEBUG) - printf( - "optimal_square_len is %d and we have %d columns, optimal_square_len / column_sizes[0] is %d " - "and num_rows is %d, batch row count is %d " - "- which makes window height " - "%d - admin size is %lu\n", - optimal_square_len, - (int)column_sizes.size(), - optimal_square_len / column_sizes[0], - total_number_of_rows, - row_batches[0].row_count, - window_height, - column_sizes.size() * sizeof(size_type) * 2); -#endif - - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. - return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { -#if defined(DEBUG) - printf( - "row size with end pad is %d and admin data is %d, which adds up to %d and that is too " - "large for shmem block of %d\n", - row_size_with_end_pad, - calc_admin_data_size(col - current_window_start_col), - row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col), - shmem_limit_per_block); - printf( - "Window size %d too large at column %d, admin size is %d, bumping back to build windows " - "of " - "size %d(cols " - "%d-%d), which is %d tall. Row size is too large at %d and ok at %d(aligned overall is " - "%d) " - "for shared mem size %d\n", - row_size_with_end_pad * window_height, - col, - calc_admin_data_size(col - current_window_start_col), - row_size * window_height, - current_window_start_col, - col - 1, - window_height, - row_size_with_end_pad, - row_size, - row_size_aligned, - shmem_limit_per_block); -#endif - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? 
col : col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); -#if defined(DEBUG) - printf( - "New window starting with offset %d and row size %d to be %d (previous column offset " - "%d+%d " - "or %d)\n", - row_size, - col_size, - row_size + col_size, - column_starts[col - 1], - column_sizes[col - 1], - column_starts[col - 1] + column_sizes[col - 1]); -#endif - row_size += col_size; // alignment required for shared memory window boundary to match + #include + #include + #include + #include + #include + + #include + #include + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + #include + #endif + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; + constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; + constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; + constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; + constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; + #endif + + using cudf::detail::make_device_uvector_async; + using rmm::device_uvector; + namespace cudf { + + namespace detail { + + static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); + } + + __global__ void copy_from_rows_fixed_width_optimized( + const cudf::size_type num_rows, const cudf::size_type num_columns, + const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, + const int8_t *input_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
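A minimal sketch of that thread layout: the x dimension indexes rows within a row group, the y dimension strides across columns, and the grid walks row groups. Kernel and variable names are stand-ins, not the patch's launch code.

#include <cstdio>

// Thread (x, y) of a block owns row x of each row group it visits and
// columns y, y + blockDim.y, y + 2 * blockDim.y, ...
__global__ void show_mapping(int num_rows, int num_columns)
{
  int const rows_per_group = blockDim.x;
  for (int group = blockIdx.x; group * rows_per_group < num_rows; group += gridDim.x) {
    int const row = group * rows_per_group + threadIdx.x;
    if (row >= num_rows) { continue; }  // keep looping; later groups may still need this thread
    for (int col = threadIdx.y; col < num_columns; col += blockDim.y) {
      if (row == 0) {
        printf("thread (%d,%d) of block %d -> row %d col %d\n",
               threadIdx.x, threadIdx.y, blockIdx.x, row, col);
      }
    }
  }
}
// launched with something like dim3 threads(block_size, y_block_size) and a 1-D grid of row groups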
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + shared_index]; + } + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. + uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { + nm[word_index(row_index)] = bitmask; + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } + } + + __global__ void copy_to_rows_fixed_width_optimized( + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, + const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. 
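That is why the bounds check wraps the per-row work instead of returning early: the shared-memory staging and __syncthreads() are collective and need every thread of the block to participate. A generic sketch of the guard pattern (not the kernel itself):

__global__ void guarded_collective(const int* in, int* out, int n)
{
  extern __shared__ int stage[];
  int const idx = blockIdx.x * blockDim.x + threadIdx.x;

  // only in-range threads do per-element work...
  if (idx < n) { stage[threadIdx.x] = in[idx] * 2; }

  // ...but every thread must reach the barrier, so no early return above
  __syncthreads();

  if (idx < n) { out[idx] = stage[threadIdx.x]; }
}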
+ if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; + + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; + } + __syncthreads(); + // Go for the next round + } + } + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + struct block_info { + int start_col; + int start_row; + int end_col; + int end_row; + int buffer_num; + + __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) 
const {
+    return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8);
+  }
+  __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; }
+
+  __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; }
+};
+
+// When building the columns to return, we have to be mindful of the offset limit in cudf.
+// It is 32-bit and these data columns are capable of surpassing that easily. The data should
+// not be cut off exactly at the limit though due to the validity buffers. The most efficient
+// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes
+// we keep track of the cut points for the validity, which we call row batches. If a batch
+// grows larger than can be represented with the 32-bit offsets, we cut it at the last 32-row
+// boundary we hit. Note that this boundary is for our book-keeping with column pointers and
+// not anything that the kernel needs to worry about. We cut the output at convenient
+// boundaries when assembling the outgoing data stream.
+struct row_batch {
+  size_type num_bytes;
+  size_type row_count;
+};
+
+/**
+ * @brief copy data from cudf columns into the row-based format
+ *
+ * @param num_rows total number of rows in the table
+ * @param num_columns total number of columns in the table
+ * @param shmem_used_per_block amount of shared memory that is used by a block
+ * @param num_block_infos number of infos in blocks array
+ * @param input_data pointer to raw table data
+ * @param col_sizes array of sizes for each element in a column - one per column
+ * @param col_offsets offset into input data row for each column's start
+ * @param block_infos information about the blocks of work
+ * @param row_offsets offset to a specific row in the output data
+ * @param output_data pointers to the output data, one per row batch
+ *
+ */
+__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns,
+                             const size_type shmem_used_per_block, const size_type num_block_infos,
+                             const int8_t **input_data, const size_type *col_sizes,
+                             const size_type *col_offsets, const block_info *block_infos,
+                             const size_type *row_offsets, int8_t **output_data) {
+  // We are going to copy the data in two passes.
+  // The first pass copies a chunk of data into shared memory.
+  // The second pass copies that chunk from shared memory out to the final location.
+
+  // Because shared memory is limited we copy a subset of the rows at a time.
+  // This has been broken up for us in the block_info struct, so we don't have
+  // any calculation to do here, but it is important to note.
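+  // As a rough illustration (hypothetical block, not produced by this patch): a block_info of
+  // {start_col = 0, start_row = 0, end_col = 2, end_row = 255} over columns laid out as INT32 at
+  // offset 0, INT64 at offset 8 and INT8 at offset 16 has
+  //   get_shared_row_size = align_offset(16 + 1 - 0, 8) = 24 bytes per row,
+  // so the 256 rows of that block occupy 256 * 24 = 6144 bytes of shared memory.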
+ + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); + + size_t fetch; + size_t subset; + for (subset = fetch = 0; subset < blocks_remaining; ++subset) { + // Fetch ahead up to stages_count subsets + for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; + auto const num_fetch_cols = fetch_block.num_cols(); + auto const num_fetch_rows = fetch_block.num_rows(); + auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_block.start_col]; + auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // wait for the last use of the memory to be completed + if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + // to do the copy we need to do n column copies followed by m element copies OR + // we have to do m element copies followed by r row copies. When going from column + // to row it is much easier to copy by elements first otherwise we would need a running + // total of the column sizes for our block, which isn't readily available. This makes it + // more appealing to copy element-wise from input data into shared matching the end layout + // and do row-based memcopies out. 
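+      // For instance (hypothetical numbers): with a shared row size of 24 bytes and a starting
+      // column offset of 0, the element at relative_row = 2 whose column starts 8 bytes into the
+      // row lands at shared offset 2 * 24 + 8 = 56, i.e. the same offset the column occupies in
+      // the output row, measured from this block's starting column offset.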
+ + auto const shared_buffer_base = shared[fetch % stages_count]; + for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + auto const relative_col = el / num_fetch_rows; + auto const relative_row = el % num_fetch_rows; + auto const absolute_col = relative_col + fetch_block.start_col; + auto const absolute_row = relative_row + fetch_block.start_row; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; + auto const relative_col_offset = col_offset - starting_column_offset; + + auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const input_src = input_data[absolute_col] + col_size * absolute_row; + + // copy the element from global memory + switch (col_size) { + case 2: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<2>(col_size), fetch_barrier); + break; + case 4: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<4>(col_size), fetch_barrier); + break; + case 8: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, + cuda::aligned_size_t<8>(col_size), fetch_barrier); + break; + default: + cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, + fetch_barrier); + break; + } + } + } + + auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; + subset_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[block.start_col]; + auto const block_output_buffer = output_data[block.buffer_num]; + + // copy entire rows to final dest + for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; + absolute_row += blockDim.x) { + auto const relative_row = absolute_row - block.start_row; + auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; + auto const shared_offset = block_row_size * relative_row; + + cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], + cuda::aligned_size_t<8>(block_row_size), subset_barrier); + } + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_data pointer to output data, partitioned by data size + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ + __global__ void copy_validity_to_rows( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each 
thread of warp reads a single int32 of validity - so we read 128 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. + // probably need knobs for number of rows vs columns to balance read/write + auto group = cooperative_groups::this_thread_block(); + + int const blocks_remaining = + std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + + __shared__ cuda::barrier + shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&shared_block_barriers[i], group.size()); + } + } + + group.sync(); + + for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { + if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] + .arrive_and_wait(); + } + int8_t *this_shared_block = shared_blocks[validity_block % 2]; + auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + + auto const num_block_cols = block.num_cols(); + auto const num_block_rows = block.num_rows(); + + auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const validity_data_row_length = + align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + auto const total_sections = num_sections_x * num_sections_y; + + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + + // the block is divided into sections. A warp operates on a section at a time. + for (int my_section_idx = warp_id; my_section_idx < total_sections; + my_section_idx += warps_per_block) { + // convert to rows and cols + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * 32 + lane_id; + auto const relative_row = section_y * 32; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + auto const cols_left = num_columns - absolute_col; + auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); + + if (absolute_col < num_columns) { + auto my_data = input_nm[absolute_col] != nullptr ? + input_nm[absolute_col][absolute_row / 32] : + std::numeric_limits::max(); + + // every thread that is participating in the warp has a byte, but it's column-based + // data and we need it in row-based. So we shuffle the bits around with ballot_sync to + // make the bytes we actually write. 
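+          // For example: on iteration i of the loop below, lane k of the warp tests whether row
+          // (relative_row + i) is valid in its column, and __ballot_sync packs lane k's answer
+          // into bit k of the result. That gives one row-major validity word covering the (up
+          // to) 32 columns this warp owns, e.g. 0b1001 if only lanes 0 and 3 saw a valid value.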
+        bitmask_type dw_mask = 1;
+        for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) {
+          auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask);
+          // lead thread in each warp writes data
+          auto const validity_write_offset =
+            validity_data_row_length * (relative_row + i) + relative_col / 8;
+          if (threadIdx.x % detail::warp_size == 0) {
+            if (cols_left <= 8) {
+              // write byte
+              this_shared_block[validity_write_offset] = validity_data & 0xFF;
+            } else if (cols_left <= 16) {
+              // write int16
+              *reinterpret_cast<int16_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data & 0xFFFF;
+            } else if (cols_left <= 24) {
+              // write int16 and then int8
+              *reinterpret_cast<int16_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data & 0xFFFF;
+              this_shared_block[validity_write_offset + 2] = (validity_data >> 16) & 0xFF;
+            } else {
+              // write int32
+              *reinterpret_cast<int32_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data;
+            }
+          }
+        }
+      }
+    }
+
+    // make sure entire block has finished copy
+    group.sync();
+
+    auto const output_data_base =
+      output_data[block.buffer_num] + validity_offset + block.start_col / 8;
+
+    // now async memcpy the shared memory out to the final destination
+    for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) {
+      auto const relative_row = row - block.start_row;
+      auto const output_ptr = output_data_base + row_offsets[row];
+      auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8);
+
+      cuda::memcpy_async(
+        output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes,
+        shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]);
+    }
+  }
+
+  // wait for last blocks of data to arrive
+  auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ?
+                                    NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED :
+                                    blocks_remaining;
+  for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) {
+    shared_block_barriers[validity_block].arrive_and_wait();
+  }
+}
+
+static __device__ std::tuple<size_t, size_t>
+get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) {
+  auto const col_size_bytes = num_cols * col_size_size;
+  auto const col_offset_bytes = num_cols * col_offset_size;
+
+  return {col_size_bytes, col_offset_bytes};
+}
+
+/**
+ * @brief copy data from row-based format to cudf columns
+ *
+ * @param num_rows total number of rows in the table
+ * @param num_columns total number of columns in the table
+ * @param shmem_used_per_block amount of shared memory that is used by a block
+ * @param row_offsets offset to a specific row in the input data
+ * @param output_data pointers to the output column data
+ * @param _col_sizes array of sizes for each element in a column - one per column
+ * @param _col_offsets offset into input data row for each column's start
+ * @param block_infos information about the blocks of work
+ * @param num_block_infos number of infos in blocks array
+ * @param input_data pointer to input data
+ *
+ */
+__global__ void copy_from_rows(const size_type num_rows, const size_type num_columns,
+                               const size_type shmem_used_per_block, const size_type *row_offsets,
+                               int8_t **output_data, const size_type *_col_sizes,
+                               const size_type *_col_offsets, const block_info *block_infos,
+                               const size_type num_block_infos, const int8_t *input_data) {
+  // We are going to copy the data in two passes.
+  // The first pass copies a chunk of data into shared memory.
+  // The second pass copies that chunk from shared memory out to the final location.
+
+  // Because shared memory is limited we copy a subset of the rows at a time.
+ // This has been broken up for us in the block_info struct, so we don't have + // any calculation to do here, but it is important to note. + + // to speed up some of the random access memory we do, we copy col_sizes and col_offsets + // to shared memory for each of the blocks that we work on + + constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + auto group = cooperative_groups::this_thread_block(); + extern __shared__ int8_t shared_data[]; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + + __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + if (group.thread_rank() == 0) { + for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { + init(&block_barrier[i], group.size()); + } + } + + group.sync(); + + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + + size_t fetch_index; + size_t processing_index; + for (processing_index = fetch_index = 0; processing_index < blocks_remaining; + ++processing_index) { + // Fetch ahead up to stages_count groups + for (; fetch_index < static_cast(blocks_remaining) && + fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = _col_offsets[fetch_block.start_col]; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } + + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = + (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, fetch_barrier); + } + } + + auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + + // ensure our data is ready + processing_barrier.arrive_and_wait(); + + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; + auto const rows_in_block = block.num_rows(); + auto const cols_in_block = block.num_cols(); + + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + 
sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); + auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); + auto shared_col_offsets = + reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); + + auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + + auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); + + // now we copy from shared memory to final destination. + // the data is laid out in rows in shared memory, so the reads + // for a column will be "vertical". Because of this and the different + // sizes for each column, this portion is handled on row/column basis. + // to prevent each thread working on a single row and also to ensure + // that all threads can do work in the case of more threads than rows, + // we do a global index instead of a double for loop with col/row. + for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { + auto const relative_col = index % cols_in_block; + auto const relative_row = index / cols_in_block; + auto const absolute_col = relative_col + block.start_col; + auto const absolute_row = relative_row + block.start_row; + + auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + + shared_memory_row_offset + shared_row_offset; + auto const column_size = shared_col_sizes[relative_col]; + + int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; + int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + + cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); + } + group.sync(); + } + + // wait on the last copies to complete + for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { + block_barrier[i].arrive_and_wait(); + } + } + + /** + * @brief copy data from row-based format to cudf columns + * + * @param num_rows total number of rows in the table + * @param num_columns total number of columns in the table + * @param shmem_used_per_block amount of shared memory that is used by a block + * @param offsets + * @param output_nm + * @param validity_offsets offset into input data row for validity data + * @param block_infos information about the blocks of work + * @param num_block_infos number of infos in blocks array + * @param input_data pointer to input data + * + */ + __global__ void copy_validity_from_rows( + const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, + const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, + const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { + extern __shared__ int8_t shared_data[]; + int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_block / 2}; + + // per conversation with DaveB + // each thread of warp reads a single byte of validity - so we read 32 bytes + // then ballot_sync the bits and write the result to shmem + // after we fill shared mem memcpy it out in a blob. 
+  // probably need knobs for number of rows vs columns to balance read/write
+  auto group = cooperative_groups::this_thread_block();
+
+  int const blocks_remaining =
+    std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL,
+             (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL);
+
+  __shared__ cuda::barrier<cuda::thread_scope_block>
+    shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED];
+  if (group.thread_rank() == 0) {
+    for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) {
+      init(&shared_block_barriers[i], group.size());
+    }
+  }
+
+  group.sync();
+
+  for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) {
+    auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED;
+    if (validity_block != validity_index) {
+      shared_block_barriers[validity_index].arrive_and_wait();
+    }
+    int8_t *this_shared_block = shared_blocks[validity_block % 2];
+    auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block];
+    auto const block_start_col = block.start_col;
+    auto const block_start_row = block.start_row;
+    auto const num_block_cols = block.num_cols();
+    auto const num_block_rows = block.num_rows();
+    auto const num_sections_x = (num_block_cols + 7) / 8;
+    auto const num_sections_y = (num_block_rows + 31) / 32;
+    auto const validity_data_col_length = num_sections_y * 4;  // words to bytes
+    auto const total_sections = num_sections_x * num_sections_y;
+    int const warp_id = threadIdx.x / detail::warp_size;
+    int const lane_id = threadIdx.x % detail::warp_size;
+    auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size);
+
+    // the block is divided into sections. A warp operates on a section at a time.
+    for (int my_section_idx = warp_id; my_section_idx < total_sections;
+         my_section_idx += warps_per_block) {
+      // convert to rows and cols
+      auto const section_x = my_section_idx % num_sections_x;
+      auto const section_y = my_section_idx / num_sections_x;
+      auto const relative_col = section_x * 8;
+      auto const relative_row = section_y * 32 + lane_id;
+      auto const absolute_col = relative_col + block_start_col;
+      auto const absolute_row = relative_row + block_start_row;
+      auto const rows_left = num_rows - absolute_row;
+
+      auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows);
+
+      if (absolute_row < num_rows) {
+        auto const my_byte =
+          input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8];
+
+        // every thread that is participating in the warp has a byte, but it's row-based
+        // data and we need it in column-based. So we shuffle the bits around to make
+        // the bytes we actually write.
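+        // For example: on iteration i of the loop below, lane k of the warp tests whether
+        // column (relative_col + i) is valid in its row, and __ballot_sync packs lane k's
+        // answer into bit k of the result, giving one column-major validity word covering the
+        // (up to) 32 rows this warp owns, e.g. 0b0101 if only the first and third rows are valid.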
+        for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns;
+             ++i, byte_mask <<= 1) {
+          auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask);
+          // lead thread in each warp writes data
+          if (threadIdx.x % detail::warp_size == 0) {
+            auto const validity_write_offset =
+              validity_data_col_length * (relative_col + i) + relative_row / 8;
+
+            if (rows_left <= 8) {
+              // write byte
+              this_shared_block[validity_write_offset] = validity_data & 0xFF;
+            } else if (rows_left <= 16) {
+              // write int16
+              *reinterpret_cast<int16_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data & 0xFFFF;
+            } else if (rows_left <= 24) {
+              // write int16 and then int8
+              *reinterpret_cast<int16_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data & 0xFFFF;
+              this_shared_block[validity_write_offset + 2] = (validity_data >> 16) & 0xFF;
+            } else {
+              // write int32
+              *reinterpret_cast<int32_t *>(&this_shared_block[validity_write_offset]) =
+                validity_data;
+            }
+          }
+        }
+      }
+    }
+
+    // make sure entire block has finished copy
+    group.sync();
+
+    // now async memcpy the shared memory out to the final destination
+    for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) {
+      auto const relative_col = col - block.start_col;
+
+      cuda::memcpy_async(
+        output_nm[col] + word_index(block_start_row),
+        &this_shared_block[validity_data_col_length * relative_col],
+        util::div_rounding_up_unsafe(num_block_rows, 8),
+        shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]);
+    }
+  }
+
+  // wait for last blocks of data to arrive
+  auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ?
+                                    NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED :
+                                    blocks_remaining;
+  for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) {
+    shared_block_barriers[validity_block].arrive_and_wait();
+  }
+}
+
+#endif  // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+/**
+ * Calculate the dimensions of the kernel for fixed width only columns.
+ * @param [in] num_columns the number of columns being copied.
+ * @param [in] num_rows the number of rows being copied.
+ * @param [in] size_per_row the size each row takes up when padded.
+ * @param [out] blocks the size of the blocks for the kernel
+ * @param [out] threads the size of the threads for the kernel
+ * @return the size in bytes of shared memory needed for each block.
+ */
+static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns,
+                                        const cudf::size_type num_rows,
+                                        const cudf::size_type size_per_row, dim3 &blocks,
+                                        dim3 &threads) {
+  // We have found speed degrades when a thread handles more than 4 columns.
+  // Each block is 2 dimensional. The y dimension indicates the columns.
+  // We limit this to 32 threads in the y dimension so we can still
+  // have at least 32 threads in the x dimension (1 warp) which should
+  // result in better coalescing of memory operations. We also
+  // want to guarantee that we are processing a multiple of 32 threads
+  // in the x dimension because we use atomic operations at the block
+  // level when writing validity data out to main memory, and that would
+  // need to change if we split a word of validity data between blocks.
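+  // As a rough worked example (hypothetical sizes, not fixed by this patch): with
+  // num_columns = 10 and size_per_row = 24 bytes,
+  //   y_block_size          = (10 + 3) / 4   = 3
+  //   x_possible_block_size = 1024 / 3       = 341
+  //   max_block_size        = 48 * 1024 / 24 = 2048, capped to 341
+  //   block_size            = (341 / 32) * 32 = 320 threads in x
+  // so the kernel runs 320x3 threads per block and needs 320 * 24 = 7680 bytes of shared memory.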
+  int y_block_size = (num_columns + 3) / 4;  // cudf::util::div_rounding_up_safe(num_columns, 4);
+  if (y_block_size > 32) {
+    y_block_size = 32;
+  }
+  int x_possible_block_size = 1024 / y_block_size;
+  // 48KB is the default setting for shared memory per block according to the CUDA tutorials.
+  // If someone configures the GPU to only have 16 KB this might not work.
+  int max_shared_size = 48 * 1024;
+  int max_block_size = max_shared_size / size_per_row;
+  // If we don't have enough shared memory there is no point in having more threads
+  // per block that would just sit idle
+  max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size;
+  // Make sure that the x dimension is a multiple of 32. This not only helps
+  // coalesce memory access, it also lets us do a ballot sync for validity to write
+  // the data back out at the warp level. If x is a multiple of 32 then each thread in the y
+  // dimension is associated with one or more warps, which should correspond to the validity
+  // words directly.
+  int block_size = (max_block_size / 32) * 32;
+  CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory");
+
+  int num_blocks = (num_rows + block_size - 1) / block_size;
+  if (num_blocks < 1) {
+    num_blocks = 1;
+  } else if (num_blocks > 10240) {
+    // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1,
+    // but in practice having too many can cause some overhead that is not fully
+    // understood. Experiments show that having as little as 600 blocks is enough
+    // to saturate memory on a V100, so this limit is an order of magnitude higher
+    // to try and future proof this a bit.
+    num_blocks = 10240;
+  }
+  blocks.x = num_blocks;
+  blocks.y = 1;
+  blocks.z = 1;
+  threads.x = block_size;
+  threads.y = y_block_size;
+  threads.z = 1;
+  return size_per_row * block_size;
+}
+
+/**
+ * When converting to rows it is possible that the size of the table is too big to fit
+ * in a single column. This creates an output column for a subset of the rows in the table,
+ * starting at start_row and containing the next num_rows rows. Most of the parameters passed
+ * into this function are common between runs and should be calculated once.
+ */ + static std::unique_ptr + fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + rmm::device_uvector &column_start, + rmm::device_uvector &column_size, + rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + + std::unique_ptr data = cudf::make_numeric_column( + cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_to_rows_fixed_width_optimized<<>>( + start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), + input_data.data(), input_nm.data(), data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); + } + + static cudf::data_type get_data_type(const cudf::column_view &v) { + return v.type(); + } + + static inline bool are_all_fixed_width(std::vector const &schema) { + return std::all_of(schema.begin(), schema.end(), + [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + } + + /** + * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @param [in] schema the types of columns that need to be laid out. + * @param [out] column_start the byte offset where each column starts in the row. + * @param [out] column_size the size in bytes of the data for each columns in the row. + * @return the size in bytes each row needs. + */ + static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) { + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. + int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add + // it in + int32_t validity_bytes_needed = + (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); + // validity comes at the end and is byte aligned so we can pack more in. 
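+  // For example (hypothetical schema): INT8, INT32, INT64 are laid out at offsets 0, 4 and 8,
+  // so at_offset is 16 here; one validity byte brings it to 17, and the final alignment below
+  // pads the row out to 24 bytes.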
+ at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) + } + + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + template + static size_type compute_column_information(iterator begin, iterator end, + std::vector &column_starts, + std::vector &column_sizes) //, + // std::function nested_type_cb) + { + size_type fixed_width_size_per_row = 0; + for (auto cv = begin; cv != end; ++cv) { + auto col_type = std::get<0>(*cv); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } + + // a list or string column will write a single uint64 + // of data here for offset/length + auto col_size = nested_type ? 8 : size_of(col_type); + + // align size for this type + std::size_t const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + column_starts.push_back(fixed_width_size_per_row); + column_sizes.push_back(col_size); + fixed_width_size_per_row += col_size; + } + + auto validity_offset = fixed_width_size_per_row; + column_starts.push_back(validity_offset); + + return fixed_width_size_per_row; + } + + std::vector + build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_block, + std::vector const &row_batches) { + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = align_offset( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 8s and ship it off + return std::min(8, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(), + 8); + // we fit as much as we can given the column stride + // note that an element in the table takes just 1 bit, but a row with a single + // element still takes 8 bytes! 
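+  // As a rough worked example (hypothetical limits): with shmem_limit_per_block = 24 KB the
+  // desired edge is sqrt(24576), roughly 156, so for a wide table column_stride becomes
+  // round_down(156, 8) = 152 columns. The line below then gives bytes_per_row =
+  // align_offset(ceil(152 / 8), 8) = 24, so a tall table gets row_stride = 24576 / 24 = 1024
+  // rows per validity block.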
+ auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int row = 0; + while (row < num_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(row_stride, rows_left_in_batch); + + validity_block_infos.emplace_back(detail::block_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); + row += window_height; + rows_left_in_batch -= window_height; + } + } + + return validity_block_infos; + } + + std::vector build_block_infos(std::vector const &column_sizes, + std::vector const &column_starts, + std::vector const &row_batches, + size_type const total_number_of_rows, + size_type const &shmem_limit_per_block) { + std::vector block_infos; + + // block infos are organized with the windows going "down" the columns + // this provides the most coalescing of memory access + int current_window_width = 0; + int current_window_start_col = 0; + + // build the blocks for a specific set of columns + auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( + int const start_col, int const end_col, int const desired_window_height) { + int current_window_start_row = 0; + int current_window_row_batch = 0; + int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int i = 0; + while (i < total_number_of_rows) { + if (rows_left_in_batch == 0) { + current_window_row_batch++; + rows_left_in_batch = row_batches[current_window_row_batch].row_count; + } + int const window_height = std::min(desired_window_height, rows_left_in_batch); + + block_infos.emplace_back(detail::block_info{ + start_col, current_window_start_row, end_col, + std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), + current_window_row_batch}); + + i += window_height; + current_window_start_row += window_height; + rows_left_in_batch -= window_height; + } + }; + + // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other blocks will read/write the edges + // this may not turn out to be overly important. For now, we will attempt to build a square + // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The + // trick is that it's in bytes, not rows or columns. + size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); + int const window_height = std::clamp( + util::round_up_safe( + std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], + total_number_of_rows), + 32), + 1, row_batches[0].row_count); + + auto calc_admin_data_size = [](int num_cols) -> size_type { + // admin data is the column sizes and column start information. + // this is copied to shared memory as well and needs to be accounted for + // in the window calculation. 
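+    // e.g. a window of 6 columns needs 6 * 4 + 6 * 4 = 48 bytes of shared memory for this
+    // bookkeeping on top of the row data itself (assuming the usual 4-byte size_type).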
+ return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); + }; + + int row_size = 0; + + // march each column and build the blocks of appropriate sizes + for (unsigned int col = 0; col < column_sizes.size(); ++col) { + auto const col_size = column_sizes[col]; + + // align size for this type + std::size_t alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto row_size_with_this_col = row_size_aligned + col_size; + auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + + if (row_size_with_end_pad * window_height + + calc_admin_data_size(col - current_window_start_col) > + shmem_limit_per_block) { + // too large, close this window, generate vertical blocks and restart + build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); + row_size = + detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory window boundary to match // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); - } - - return block_infos; -} - -#if defined(DEBUG) -void pretty_print(uint64_t i) -{ - if (i > (1 * 1024 * 1024 * 1024)) { - printf("%.2f GB", i / float(1 * 1024 * 1024 * 1024)); - } else if (i > (1 * 1024 * 1024)) { - printf("%.2f MB", i / float(1 * 1024 * 1024)); - } else if (i > (1 * 1024)) { - printf("%.2f KB", float(i / 1024)); - } else { - printf("%lu Bytes", i); - } -} -#endif -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -} // namespace detail - -std::vector> convert_to_rows(cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#if defined(DEBUG) || 1 - total_shmem -= 1024; -#endif - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - -#if defined(DEBUG) - size_t free, total; - cudaMemGetInfo(&free, &total); - printf("%lu/%lu Memory\n", free, total); -#endif - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. 
Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = detail::compute_column_information(iter, - iter + num_columns, - column_starts, - column_sizes); //, - // [&variable_width_columns](column_view const &cv) { variable_width_columns.push_back(cv); }); - /* size_type fixed_width_size_per_row = 0; - for (int col = 0; col < num_columns; ++col) { - auto cv = tbl.column(col); - auto col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (nested_type) { variable_width_columns.push_back(cv); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - }*/ - -#if defined(DEBUG) - printf("validity offset will be %d + %d = %d\n", - column_starts.back(), - column_sizes.back(), - column_starts.back() + column_sizes.back()); -#endif - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; - - auto calculate_variable_width_row_data_size = [](int const row) { - // each level of variable-width data will add an offset/length - // uint64 of data. The first of which is inside the fixed-width - // data itself and needs to be aligned based on what is around - // that data. 
This is handled above with the fixed-width calculations - // for that reason. We may still need to add more of these offset/length - // combinations if the nesting is deeper than one level as these - // will be included in the variable-width data blob at the end of the - // row. - return 0; - /* auto c = variable_width_columns[col]; - while (true) { - auto col_offsets = c.child(0).data(); - auto col_data_size = size_of(c.child(1).type()); - std::size_t alignment_needed = col_data_size; - - row_sizes[row] += (col_offsets[row + 1] - col_offsets[row]) * col_data_size; - if (c.num_children() == 0) { - break; - } - c = c.child(1); - } - exclusive_scan([t](int row_index) { - size_type total_row_size = 0; - for (int i=0 i - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - -#if defined(DEBUG) - printf("%d rows and %d columns in table\n", num_rows, num_columns); - printf("%lu batches:\n", row_batches.size()); - for (auto i = 0; i < (int)row_batches.size(); ++i) { - printf("%d: %d rows, ", i, row_batches[i].row_count); - detail::pretty_print(row_batches[i].num_bytes); - printf("\n"); - } -#endif - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - -#if defined(DEBUG) - printf("%lu windows for %d columns, %d rows to fit in ", - block_infos.size(), - block_infos[0].end_col - block_infos[0].start_col + 1, - block_infos[0].end_row - block_infos[0].start_row); - detail::pretty_print(shmem_limit_per_block); - printf(" shared mem("); - detail::pretty_print(fixed_width_size_per_row); - printf("/row, %d columns, %d rows, ", num_columns, num_rows); - detail::pretty_print(total_table_size); - printf(" total):\n"); -#endif - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); - dim3 threads(256); - -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - detail::pretty_print(shmem_limit_per_block); - printf(" shared memory\n"); -#endif - detail::copy_from_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - block_infos.size(), - dev_input_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - 
dev_block_infos.data(), - dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); -#if defined(DEBUG) - printf("Launching validity kernel with %d blocks, for %lu validity blocks with %d threads, ", - validity_blocks.x, - validity_block_infos.size(), - validity_threads.x); - detail::pretty_print(total_shmem); - printf(" shared memory\n"); -#endif - detail:: - copy_validity_from_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - dev_row_offsets.data(), - dev_output_data.data(), - column_starts.back(), - dev_validity_block_infos.data(), - validity_block_infos.size(), - dev_input_nm.data()); - - // split up the output buffer into multiple buffers based on row batch sizes - // and create list of byte columns - int offset_offset = 0; - std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique( - data_type{type_id::INT32}, (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique( - data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, std::move(output_buffers[i])); - - ret.push_back(cudf::make_lists_column(row_batches[i].row_count, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, - stream, - mr)); - } - - return ret; -#else - CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); - return {}; -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -} - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - const cudf::size_type num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. 
- max_rows_per_batch = (max_rows_per_batch / 32) * 32; - - cudf::size_type num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, - row_count, - num_columns, - size_per_row, - dev_column_start, - dev_column_size, - dev_input_data, - dev_input_nm, - *zero, - *step, - stream, - mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -std::unique_ptr convert_from_rows(cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - -#if defined(DEBUG) || 1 - total_shmem -= 1024; -#endif - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - dim3 threads(std::min(std::min(128, shmem_limit_per_block / 8), (int)child.size())); -#else - dim3 threads(std::min(256, (int)child.size())); -#endif -#if defined(DEBUG) - printf("Launching kernel with %d blocks, %d threads, ", blocks.x, threads.x); - detail::pretty_print(total_shmem); - printf(" shared memory\n"); -#endif - detail::copy_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_data.data(), - dev_col_sizes.data(), - dev_col_starts.data(), - dev_block_infos.data(), - block_infos.size(), - child.data()); - - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, - row, - std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); -#if defined(DEBUG) - printf( - "Launching validity kernel with %d blocks, for %lu validity blocks, col stride %d and row " - "stride of %d with %d threads, ", - validity_blocks.x, - validity_block_infos.size(), - column_stride, - row_stride, - threads.x); - detail::pretty_print(total_shmem); - 
printf(" shared memory\n"); -#endif - - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail:: - copy_validity_to_columns<<>>( - num_rows, - num_columns, - shmem_limit_per_block, - input.offsets().data(), - dev_output_nm.data(), - column_starts.back(), - dev_validity_block_infos.data(), - validity_block_infos.size(), - child.data()); - - return std::make_unique(std::move(output_columns)); -#else - CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); - return {}; -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -} - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column( - schema[i], num_rows, cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - // printf("Launching (%d, %d, %d) blocks, (%d, %d, %d) threads, with %d shared size\n", - // blocks.x, blocks.y, blocks.z, threads.x, threads.y, threads.z, shared_size); - // printf("pointers are column_start: %p, column_size: %p, output_data: %p, output_nm: %p\n", - // dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), - // dev_output_nm.data()); - detail::copy_to_fixed_width_columns<<>>( - num_rows, - num_columns, - size_per_row, - dev_column_start.data(), - dev_column_size.data(), - dev_output_data.data(), - dev_output_nm.data(), - child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } -} - -} // namespace cudf + current_window_start_col = col; + current_window_width = 0; + } else { + row_size = row_size_with_this_col; + current_window_width++; + } + } + + // build last set of blocks + if (current_window_width > 0) { + build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + } 
+ + return block_infos; + } + + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + + } // namespace detail + + std::vector> convert_to_rows(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the + // data, but small enough that multiple columns fit in memory so the writes can coalese as well. + // Potential optimization for window sizes. + const size_type num_columns = tbl.num_columns(); + const size_type num_rows = tbl.num_rows(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO: why? + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + // break up the work into blocks, which are a starting and ending row/col #. + // this window size is calculated based on the shared memory size available + // we want a single block to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The windows are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // windows so the windows can be properly cut around them. 
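As a rough illustration of the batching rule described above, the sketch below walks per-row byte sizes in groups of 32 rows and closes a batch whenever the next group would push it past the signed 32-bit offset limit. Names such as cut_row_batches are illustrative; the real code folds this into the row-size loop that follows.

#include <cstdint>
#include <limits>
#include <vector>

struct batch { std::uint64_t num_bytes; int row_count; };

// row_bytes holds the already-aligned size of every row; each group of 32 rows
// is assumed to fit under the limit on its own.
std::vector<batch> cut_row_batches(std::vector<std::uint64_t> const &row_bytes) {
  constexpr std::uint64_t limit = std::numeric_limits<std::int32_t>::max();
  std::vector<batch> batches;
  std::uint64_t batch_size = 0;
  int batch_rows           = 0;
  for (std::size_t row = 0; row < row_bytes.size(); row += 32) {
    std::uint64_t group_size = 0;
    int group_rows           = 0;
    for (std::size_t r = row; r < row_bytes.size() && r < row + 32; ++r) {
      group_size += row_bytes[r];
      ++group_rows;
    }
    if (batch_rows > 0 && batch_size + group_size > limit) {
      batches.push_back({batch_size, batch_rows});  // close the batch on a 32-row boundary
      batch_size = 0;
      batch_rows = 0;
    }
    batch_size += group_size;
    batch_rows += group_rows;
  }
  if (batch_rows > 0) batches.push_back({batch_size, batch_rows});
  return batches;
}

Cutting only on 32-row boundaries keeps a validity word from spanning two output buffers, mirroring the comment in the fixed-width path further down.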
+ + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + input_data.reserve(num_columns); + input_nm.reserve(num_columns); + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); + auto const col_type = cv.type(); + bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + + if (!nested_type) { + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + } + + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + std::vector row_sizes; // size of each row in bytes including any alignment padding + std::vector row_offsets; // offset from the start of the data to this row + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + std::vector + variable_width_columns; // list of the variable width columns in the table + row_sizes.reserve(num_rows); + row_offsets.reserve(num_rows); + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + }); + + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + + std::vector row_batches; + + uint64_t row_batch_size = 0; + uint64_t total_table_size = 0; + size_type row_batch_rows = 0; + uint64_t row_offset = 0; + + // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then + // calculate the size of each row's variable-width data and validity as well. 
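A small worked example of the row-size rule this comment describes, assuming the validity bitmap is stored as whole 32-bit words (one bit per column) and rows are padded to 8 bytes; row_size_estimate is an illustrative helper, not part of the cudf API:

#include <cstdint>
#include <iostream>

constexpr std::int32_t align_offset(std::int32_t offset, std::int32_t alignment) {
  return (offset + alignment - 1) & ~(alignment - 1);
}

std::int32_t row_size_estimate(std::int32_t fixed_width_bytes, std::int32_t num_columns) {
  std::int32_t const validity_bytes = ((num_columns + 31) / 32) * 4;  // one bit per column, whole words
  return align_offset(fixed_width_bytes + validity_bytes, 8);         // rows are 8-byte aligned
}

int main() {
  // nine columns adding up to 30 bytes of fixed-width data:
  // 30 + 4 validity bytes = 34, padded up to 40 bytes per row
  std::cout << row_size_estimate(30, 9) << "\n";  // prints 40
}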
+ auto validity_size = num_bitmask_words(num_columns) * 4; + // thrust + for (int row = 0; row < num_rows; ++row) { + auto aligned_row_batch_size = + detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned + row_sizes[row] = fixed_width_size_per_row; + // validity is byte aligned + row_sizes[row] += validity_size; + // variable width data is 8-byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned + + if ((uint64_t)aligned_row_batch_size + row_sizes[row] > + (uint64_t)std::numeric_limits::max()) { + // a new batch starts at the last 32-row boundary + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); + row_batch_size = 0; + row_batch_rows = row_batch_rows & 31; + row_offset = 0; + aligned_row_batch_size = 0; + } + row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned + row_offsets.push_back(row_offset); + row_batch_size = aligned_row_batch_size + row_sizes[row]; + row_offset += row_sizes[row]; + total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned + total_table_size += row_sizes[row]; + row_batch_rows++; + } + if (row_batch_size > 0) { + row_batches.push_back( + detail::row_batch{static_cast(row_batch_size), row_batch_rows}); + } + + auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + + std::vector output_buffers; + std::vector output_data; + output_data.reserve(row_batches.size()); + for (uint i = 0; i < row_batches.size(); ++i) { + rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.push_back(static_cast(temp.data())); + output_buffers.push_back(std::move(temp)); + } + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + // blast through the entire table and convert it + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 threads(256); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + reinterpret_cast(dev_output_data.data())); + + auto validity_block_infos = + build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail::copy_validity_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), + column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), + dev_input_nm.data()); + + // split up the output buffer into multiple buffers based on row batch sizes + // and create list of byte columns + int offset_offset = 0; + std::vector> ret; + for (uint i = 0; i < row_batches.size(); ++i) { + // compute offsets for this row batch + std::vector offset_vals; + offset_vals.reserve(row_batches[i].row_count + 1); + size_type cur_offset = 0; + offset_vals.push_back(cur_offset); + for (int row = 0; row < row_batches[i].row_count; ++row) { + cur_offset = 
detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; + offset_vals.push_back(cur_offset); + } + offset_offset += row_batches[i].row_count; + + auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); + auto offsets = std::make_unique(data_type{type_id::INT32}, + (size_type)offset_vals.size(), dev_offsets.release()); + + auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, + std::move(output_buffers[i])); + + ret.push_back( + cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + } + + return ret; + #else + CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); + return {}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + } + + std::vector> + convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + const cudf::size_type num_columns = tbl.num_columns(); + + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = make_device_uvector_async(column_start, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. + max_rows_per_batch = (max_rows_per_batch / 32) * 32; + + cudf::size_type num_rows = tbl.num_rows(); + + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = make_device_uvector_async(input_data, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid_async(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid_async(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows( + row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, + dev_input_data, dev_input_nm, *zero, *step, stream, mr)); + } + + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } + } + + std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + cudf::size_type num_rows = input.parent().size(); + + int device_id; + CUDA_TRY(cudaGetDevice(&device_id)); + int total_shmem; + CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // TODO why? + total_shmem -= 1024; + int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + + std::vector column_starts; + std::vector column_sizes; + + auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { + return std::make_tuple(schema[i], nullptr); + }); + size_type fixed_width_size_per_row = detail::compute_column_information( + iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + + size_type validity_size = num_bitmask_words(num_columns) * 4; + + size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + + // build the row_batches from the passed in list column + std::vector row_batches; + + row_batches.push_back(detail::row_batch{child.size(), num_rows}); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + std::vector block_infos = + build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); + + auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); + detail::copy_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), + block_infos.size(), child.data()); + + auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const column_stride = [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, group it into 64s and ship it off + return std::min(64, num_columns); + } else { + return util::round_down_safe(desired_rows_and_columns, 8); + } + }(); + auto const row_stride = [&]() { + // we fit as much as we can, we know the column stride now, so calculate the row + return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); + /* if (desired_rows_and_columns > num_rows) { + return std::min(32, num_rows); + } else { + return util::round_down_safe(desired_rows_and_columns, 32); + }*/ + }(); + std::vector validity_block_infos; + for (int col = 0; col < num_columns; col += column_stride) { + for (int row = 0; row < num_rows; row += row_stride) { + validity_block_infos.emplace_back( + detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), + std::min(row + row_stride - 1, num_rows - 1)}); + } + } + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + dim3 validity_blocks( + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + + dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + detail:: + copy_validity_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), + validity_block_infos.size(), child.data()); + + return std::make_unique(std::move(output_columns)); + #else + CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); + return {}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + } + + std::unique_ptr 
convert_from_rows_fixed_width_optimized( + cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (detail::are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = make_device_uvector_async(column_start, stream); + auto dev_column_size = make_device_uvector_async(column_size, stream); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); + } + + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + detail::copy_from_rows_fixed_width_optimized<<>>( + num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), + dev_output_data.data(), dev_output_nm.data(), child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } + } + + } // namespace cudf + \ No newline at end of file diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index a67589fbaec..932afa4bb70 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -50,8 +50,8 @@ #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_COLUMNS = 8; -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS = 2; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; @@ -67,13 +67,11 @@ static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size return (offset + alignment - 1) & ~(alignment - 1); } -__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, - cudf::bitmask_type **output_nm, - const int8_t *input_data) { +__global__ void copy_from_rows_fixed_width_optimized( + const cudf::size_type num_rows, const 
cudf::size_type num_columns, + const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, + const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -190,12 +188,11 @@ __global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, } } -__global__ void -copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, - const cudf::size_type *num_bytes, const int8_t **input_data, - const cudf::bitmask_type **input_nm, int8_t *output_data) { +__global__ void copy_to_rows_fixed_width_optimized( + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, + const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -367,12 +364,11 @@ struct row_batch { * @param output_data pointer to output data * */ -__global__ void copy_from_columns(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - const size_type num_block_infos, const int8_t **input_data, - const size_type *col_sizes, const size_type *col_offsets, - const block_info *block_infos, const size_type *row_offsets, - int8_t **output_data) { +__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type num_block_infos, + const int8_t **input_data, const size_type *col_sizes, + const size_type *col_offsets, const block_info *block_infos, + const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. 
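The two-pass idea in these comments is the classic staged-copy pattern: coalesced loads into shared memory, a sync, then stores in the transposed order. The toy kernel below shows the shape of it on a plain byte matrix; it is not the real copy_to_rows kernel, and TILE is just an illustrative tile size.

#include <cuda_runtime.h>
#include <cstdint>

constexpr int TILE = 32;

__global__ void two_pass_tile_copy(const int8_t *in, int8_t *out, int width, int height) {
  __shared__ int8_t tile[TILE][TILE + 1];  // +1 column of padding avoids shared-memory bank conflicts

  int const x = blockIdx.x * TILE + threadIdx.x;
  int const y = blockIdx.y * TILE + threadIdx.y;

  // pass 1: coalesced reads from global memory into the shared-memory tile
  if (x < width && y < height) tile[threadIdx.y][threadIdx.x] = in[y * width + x];
  __syncthreads();

  // pass 2: write the staged tile back out transposed; global writes stay coalesced
  int const out_x = blockIdx.y * TILE + threadIdx.x;
  int const out_y = blockIdx.x * TILE + threadIdx.y;
  if (out_x < height && out_y < width) out[out_y * height + out_x] = tile[threadIdx.x][threadIdx.y];
}

A launch of dim3((width + TILE - 1) / TILE, (height + TILE - 1) / TILE) blocks with dim3(TILE, TILE) threads covers the matrix; the real kernels replace the fixed tile with per-block block_info windows and cuda::memcpy_async staging.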
@@ -396,15 +392,15 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ group.sync(); auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS); + std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); size_t fetch; size_t subset; for (subset = fetch = 0; subset < blocks_remaining; ++subset) { // Fetch ahead up to stages_count subsets for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + fetch]; + auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; auto const num_fetch_cols = fetch_block.num_cols(); auto const num_fetch_rows = fetch_block.num_rows(); auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; @@ -462,7 +458,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; auto const block_output_buffer = output_data[block.buffer_num]; @@ -499,7 +495,7 @@ __global__ void copy_from_columns(const size_type num_rows, const size_type num_ * @param input_data pointer to input data * */ -__global__ void copy_validity_from_columns( +__global__ void copy_validity_to_rows( const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { @@ -633,74 +629,6 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num return {col_size_bytes, col_offset_bytes}; } -/** - * @brief ensure `read_ahead` buffer blocks are fetched - * - * @param fetch_index internal state passed into the function - * @param processing_index index where processing is occuring - * @param read_ahead_count how many blocks to read ahead - * @param max_resident_blocks how many blocks can be loaded at once - * @param total_blocks total number of blocks overall - * @param block_infos pointer to the block infos - * @param col_sizes pointer to column size information - * @param col_offsets pointer to the table's column offsets - * @param row_offsets pointer to offsets for each row in the table - * @param input_data pointer to the input data - * @param shared pointer to shared memory - * @param group thread group participating in the fetch - * @param block_barrier barriers used for each block - * @return - */ -static __device__ void -fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_index, - int const read_ahead_count, int const max_resident_blocks, - int const total_blocks, block_info const *const block_infos, - size_type const *const col_sizes, size_type const *const col_offsets, - size_type const *const row_offsets, int8_t const *const input_data, - int8_t *shared[], cooperative_groups::thread_block const group, - cuda::barrier *block_barrier) { - for (; fetch_index < static_cast(total_blocks) && - 
fetch_index < (processing_index + read_ahead_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*col_sizes)), sizeof(decltype(*col_offsets)), num_fetch_cols); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % max_resident_blocks][shared_row_offset], - &col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main - cuda::memcpy_async(&shared[fetch_index % max_resident_blocks][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, - fetch_barrier); - } - } -} - /** * @brief copy data from row-based format to cudf columns * @@ -716,7 +644,7 @@ fetch_blocks_for_row_to_column(size_t &fetch_index, size_t const processing_inde * @param input_data pointer to input data * */ -__global__ void copy_to_columns(const size_type num_rows, const size_type num_columns, +__global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type *_col_sizes, const size_type *_col_offsets, const block_info *block_infos, @@ -746,40 +674,70 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co group.sync(); - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_COLUMNS); + auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + + size_t fetch_index; + size_t processing_index; + for (processing_index = fetch_index = 0; processing_index < blocks_remaining; + ++processing_index) { + // Fetch ahead up to stages_count groups + for (; fetch_index < static_cast(blocks_remaining) && + fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_block = + block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_block_start_row = fetch_block.start_row; + auto const fetch_block_end_row = fetch_block.end_row; + auto const starting_col_offset = _col_offsets[fetch_block.start_col]; + auto const 
fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); + auto const num_fetch_cols = fetch_block.num_cols(); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - auto get_admin_data_sizes = [col_size_size = sizeof(decltype(*_col_sizes)), - col_offset_size = sizeof(decltype(*_col_offsets))]( - int const num_cols, - int const num_rows) -> std::tuple { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; + // if we have fetched all buffers, we need to wait for processing + // to complete on them before we can use them again + if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + fetch_barrier.arrive_and_wait(); + } - return {col_size_bytes, col_offset_bytes}; - }; + auto shared_row_offset = 0; + // copy the data for column sizes + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); + shared_row_offset += col_size_bytes; + // copy the data for column offsets + cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], + &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); + shared_row_offset += col_offset_bytes; + shared_row_offset = align_offset(shared_row_offset, 8); + + for (auto row = fetch_block_start_row + static_cast(threadIdx.x); + row <= fetch_block_end_row; row += blockDim.x) { + auto shared_offset = + (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; + // copy the main + cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], + &input_data[row_offsets[row] + starting_col_offset], + fetch_block_row_size, fetch_barrier); + } + } - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - fetch_blocks_for_row_to_column(fetch, subset, stages_count, stages_count, blocks_remaining, - block_infos, _col_sizes, _col_offsets, row_offsets, input_data, - shared, group, block_barrier); + auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; // ensure our data is ready - subset_barrier.arrive_and_wait(); + processing_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_COLUMNS + subset]; + auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes(cols_in_block, rows_in_block); - // auto shared_row_offsets = shared[subset]; - auto shared_col_sizes = reinterpret_cast(shared[subset % stages_count]); + auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( + sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); + auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); auto shared_col_offsets = - reinterpret_cast(&shared[subset % stages_count][col_size_bytes]); + reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); @@ -803,10 +761,10 @@ __global__ void copy_to_columns(const 
size_type num_rows, const size_type num_co shared_memory_row_offset + shared_row_offset; auto const column_size = shared_col_sizes[relative_col]; - int8_t *shmem_src = &shared[subset % stages_count][shared_memory_offset]; + int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - cuda::memcpy_async(dst, shmem_src, column_size, subset_barrier); + cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); } group.sync(); } @@ -831,7 +789,7 @@ __global__ void copy_to_columns(const size_type num_rows, const size_type num_co * @param input_data pointer to input data * */ -__global__ void copy_validity_to_columns( +__global__ void copy_validity_from_rows( const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { @@ -1050,7 +1008,7 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty int shared_size = detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - copy_from_fixed_width_columns<<>>( + copy_to_rows_fixed_width_optimized<<>>( start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), input_data.data(), input_nm.data(), data->mutable_view().data()); @@ -1354,18 +1312,6 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector row_batches; - auto calculate_variable_width_row_data_size = [](int const row) { - // each level of variable-width data will add an offset/length - // uint64 of data. The first of which is inside the fixed-width - // data itself and needs to be aligned based on what is around - // that data. This is handled above with the fixed-width calculations - // for that reason. We may still need to add more of these offset/length - // combinations if the nesting is deeper than one level as these - // will be included in the variable-width data blob at the end of the - // row. 
- return 0; - }; - uint64_t row_batch_size = 0; uint64_t total_table_size = 0; size_type row_batch_rows = 0; @@ -1382,8 +1328,7 @@ std::vector> convert_to_rows(cudf::table_view cons // validity is byte aligned row_sizes[row] += validity_size; // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8) + - calculate_variable_width_row_data_size(row); // rows are 8 byte aligned + row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned if ((uint64_t)aligned_row_batch_size + row_sizes[row] > (uint64_t)std::numeric_limits::max()) { @@ -1426,10 +1371,10 @@ std::vector> convert_to_rows(cudf::table_view cons auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_COLUMNS)); + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); - detail::copy_from_columns<<>>( + detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), reinterpret_cast(dev_output_data.data())); @@ -1439,9 +1384,9 @@ std::vector> convert_to_rows(cudf::table_view cons auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_from_columns<<>>( num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), @@ -1610,9 +1555,9 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_to_columns<<>>( + detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), block_infos.size(), child.data()); @@ -1645,11 +1590,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_COLUMNS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail:: - copy_validity_to_columns<<>>( + copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), child.data()); @@ -1707,7 +1652,7 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( int shared_size = 
detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - detail::copy_to_fixed_width_columns<<>>( + detail::copy_from_rows_fixed_width_optimized<<>>( num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), dev_output_nm.data(), child.data()); From 2c4e12fcc6f76e21cd1d6b0ca3f44ceb9ce251e4 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 21 Oct 2021 14:49:26 -0700 Subject: [PATCH 61/80] fixed typo --- java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index 9541d05ce00..e4106574a19 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -393,7 +393,7 @@ public final void setInts(long offset, int[] data, long srcOffset, long len) { */ public final long getLong(long offset) { long requestedAddress = this.address + offset; - addressOutOfBoundsCheck(requestedAddress, 8, "setLong"); + addressOutOfBoundsCheck(requestedAddress, 8, "getLong"); return UnsafeMemoryAccessor.getLong(requestedAddress); } @@ -404,7 +404,7 @@ public final long getLong(long offset) { */ public final void setLong(long offset, long value) { long requestedAddress = this.address + offset; - addressOutOfBoundsCheck(requestedAddress, 8, "getLong"); + addressOutOfBoundsCheck(requestedAddress, 8, "setLong"); UnsafeMemoryAccessor.setLong(requestedAddress, value); } From fa4f0d3d7e9d8e6829a0a807dbc3eab053fff3d3 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 23 Oct 2021 01:21:15 +0000 Subject: [PATCH 62/80] Updating for actual PR. Fixed a few last minute bugs, removed cudf-land code that was there for testing and benchmarking. 
--- cpp/benchmarks/CMakeLists.txt | 35 - .../row_conversion/row_conversion.cpp | 181 -- cpp/src/row_conversion/row_conversion.cu | 1666 ----------------- cpp/tests/row_conversion/row_conversion.cpp | 677 ------- java/src/main/native/src/row_conversion.cu | 33 +- 5 files changed, 16 insertions(+), 2576 deletions(-) delete mode 100644 cpp/benchmarks/row_conversion/row_conversion.cpp delete mode 100644 cpp/src/row_conversion/row_conversion.cu delete mode 100644 cpp/tests/row_conversion/row_conversion.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 79783f0e512..fa1e61e26fd 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -110,21 +110,10 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask_ben # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchmark.cpp) -<<<<<<< HEAD # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) -======= -################################################################################################### -# - join benchmark -------------------------------------------------------------------------------- -<<<<<<< HEAD -ConfigureBench(JOIN_BENCH join/join_benchmark.cu) ->>>>>>> working on row and column conversions -======= -ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) -ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) ->>>>>>> Fixing merge issue # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- @@ -215,7 +204,6 @@ ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer_benchmark.cpp) # * ast benchmark --------------------------------------------------------------------------------- ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) -<<<<<<< HEAD # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench( @@ -227,18 +215,6 @@ ConfigureBench( # * nvtext benchmark ------------------------------------------------------------------- ConfigureBench( TEXT_BENCH -======= -################################################################################################### -# - binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH - binaryop/binaryop_benchmark.cpp - binaryop/compiled_binaryop_benchmark.cpp - binaryop/jit_binaryop_benchmark.cpp) - -################################################################################################### -# - nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH ->>>>>>> Fixing merge issue text/ngrams_benchmark.cpp text/normalize_benchmark.cpp text/normalize_spaces_benchmark.cpp @@ -272,7 +248,6 @@ ConfigureBench( string/url_decode_benchmark.cpp ) -<<<<<<< HEAD # ################################################################################################## # * json 
benchmark ------------------------------------------------------------------- ConfigureBench(JSON_BENCH string/json_benchmark.cpp) @@ -280,13 +255,3 @@ ConfigureBench(JSON_BENCH string/json_benchmark.cpp) # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) -======= -################################################################################################### -# - io benchmark --------------------------------------------------------------------- -ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK - io/text/multibyte_split_benchmark.cpp) - -################################################################################################### -# - row conversion benchmark --------------------------------------------------------- -ConfigureBench(ROW_CONVERSION_BENCH row_conversion/row_conversion.cpp) ->>>>>>> working on row and column conversions diff --git a/cpp/benchmarks/row_conversion/row_conversion.cpp b/cpp/benchmarks/row_conversion/row_conversion.cpp deleted file mode 100644 index fb8e4c8aef3..00000000000 --- a/cpp/benchmarks/row_conversion/row_conversion.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -class RowConversion : public cudf::benchmark { -}; - -static void BM_old_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_to_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 212, - row_count{n_rows}); - - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - total_bytes += cudf::size_of(t); - } - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto new_rows = cudf::convert_to_rows(table->view()); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_old_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - cudf::lists_column_view const first_list(rows.front()->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows_fixed_width_optimized(first_list, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -static void BM_new_from_row(benchmark::State& state) -{ - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, - 256, - row_count{n_rows}); - - std::vector schema; - cudf::size_type total_bytes = 0; - for (int i = 0; i < table->num_columns(); ++i) { - auto t = table->get_column(i).type(); - schema.push_back(t); - total_bytes += cudf::size_of(t); - } - - auto rows = cudf::convert_to_rows_fixed_width_optimized(table->view()); - cudf::lists_column_view const 
first_list(rows.front()->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto out = cudf::convert_from_rows(first_list, schema); - } - - state.SetBytesProcessed(state.iterations() * total_bytes * 2 * table->num_rows()); -} - -#define TO_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -TO_ROW_CONVERSION_BENCHMARK_DEFINE(old_to_row_conversion, BM_old_to_row) -TO_ROW_CONVERSION_BENCHMARK_DEFINE(new_to_row_conversion, BM_new_to_row) - -#define FROM_ROW_CONVERSION_BENCHMARK_DEFINE(name, f) \ - BENCHMARK_DEFINE_F(RowConversion, name) \ - (::benchmark::State & st) { f(st); } \ - BENCHMARK_REGISTER_F(RowConversion, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 20}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(old_from_row_conversion, BM_old_from_row) -FROM_ROW_CONVERSION_BENCHMARK_DEFINE(new_from_row_conversion, BM_new_from_row) diff --git a/cpp/src/row_conversion/row_conversion.cu b/cpp/src/row_conversion/row_conversion.cu deleted file mode 100644 index c068a2c0b76..00000000000 --- a/cpp/src/row_conversion/row_conversion.cu +++ /dev/null @@ -1,1666 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #include - #include - #include - #include - #include - - #include - #include - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - #include - #endif - - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; - constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; - constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; - constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; - constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; - #endif - - using cudf::detail::make_device_uvector_async; - using rmm::device_uvector; - namespace cudf { - - namespace detail { - - static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); - } - - __global__ void copy_from_rows_fixed_width_optimized( - const cudf::size_type num_rows, const cudf::size_type num_columns, - const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, - const int8_t *input_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. 
- // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. 
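This is why the kernel below computes an active_mask before any row-bound branch: every lane has to join the ballot so a full 32-bit validity word can be assembled even when the last row group only partially fills the warp. A stand-alone toy version of that pattern (illustrative names; assumes blockDim.x is a multiple of 32):

#include <cuda_runtime.h>
#include <cstdint>

__global__ void gather_validity(const bool *valid_in, std::uint32_t *valid_words, int num_rows) {
  int const row = blockIdx.x * blockDim.x + threadIdx.x;

  // all 32 lanes vote, including those whose row index is past the end of the data
  std::uint32_t const active_mask = __ballot_sync(0xffffffff, row < num_rows);
  if (row < num_rows) {
    // each in-bounds lane contributes one validity bit for its row
    std::uint32_t const bits = __ballot_sync(active_mask, valid_in[row]);
    // one lane per 32-row group writes the assembled word
    if (row % 32 == 0) valid_words[row / 32] = bits;
  }
}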
- uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; - switch (col_size) { - case 1: { - col_output[row_index] = *col_tmp; - break; - } - case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); - } - } - - __global__ void copy_to_rows_fixed_width_optimized( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, - const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
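// Annotation (not part of the original file): a "row_group" is simply the blockDim.x
// rows that one block stages in shared memory at a time, so the dynamic shared memory a
// block needs is row_size * blockDim.x bytes; calc_fixed_width_kernel_dims() further
// down sizes the launch so that exactly this amount fits within the default 48 KB budget.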
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. - if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; - col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; - switch (col_size) { - case 1: { - *col_tmp = col_input[row_index]; - break; - } - case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t *long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; - - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - - for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round - } - } - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int buffer_num; - - __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); - } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } - - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } - }; - - // When building the columns to return, we have to be mindful of the offset limit in cudf. - // It is 32-bit and these data columns are capable of surpassing that easily. The data should - // not be cut off exactly at the limit though due to the validity buffers. The most efficient - // place to cut the validity is on a 32-row boundary, so as we calculate the row sizes - // we keep track of the cut points for the validity, which we call row batches. If the row - // is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we - // hit. Note that this boundary is for our book-keeping with column pointers and not anything that - // the kernel needs to worry about. We cut the output at convienient boundaries when assembling - // the outgoing data stream. 
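// Illustrative sketch of the batching rule described above (names and structure here are
// assumptions for this annotation, simplified from the in-flight bookkeeping done in
// convert_to_rows below): accumulate rows until the next row would push the batch past
// the 32-bit offset limit, then close the batch at the previous multiple-of-32 row so a
// validity word never straddles two output columns. Assumes at least 32 rows fit in a batch.
#include <cstdint>
#include <limits>
#include <numeric>
#include <vector>

struct row_batch_sketch {
  int64_t num_bytes;  // bytes of row data owned by this batch
  int row_count;      // rows in this batch (a multiple of 32 for all but the last batch)
};

std::vector<row_batch_sketch> split_into_batches(std::vector<int64_t> const& row_sizes)
{
  std::vector<row_batch_sketch> batches;
  std::vector<int64_t> pending;  // sizes of the rows in the batch being built
  int64_t pending_bytes = 0;
  for (auto size : row_sizes) {
    if (pending_bytes + size > std::numeric_limits<int32_t>::max()) {
      // close the batch at the last multiple-of-32 row boundary and carry the rest over
      auto const keep = static_cast<int>(pending.size()) & ~31;
      auto const kept_bytes =
        std::accumulate(pending.begin(), pending.begin() + keep, int64_t{0});
      batches.push_back({kept_bytes, keep});
      pending.erase(pending.begin(), pending.begin() + keep);
      pending_bytes -= kept_bytes;
    }
    pending.push_back(size);
    pending_bytes += size;
  }
  if (!pending.empty()) {
    batches.push_back({pending_bytes, static_cast<int>(pending.size())});
  }
  return batches;
}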
- struct row_batch { - size_type num_bytes; - size_type row_count; - }; - - /** - * @brief copy data from cudf columns into x format, which is row-based - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param input_data pointer to raw table data - * @param input_nm pointer to validity data - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param row_offsets offset to a specific row in the input data - * @param output_data pointer to output data - * - */ - __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type num_block_infos, - const int8_t **input_data, const size_type *col_sizes, - const size_type *col_offsets, const block_info *block_infos, - const size_type *row_offsets, int8_t **output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - // to do the copy we need to do n column copies followed by m element copies OR - // we have to do m element copies followed by r row copies. When going from column - // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it - // more appealing to copy element-wise from input data into shared matching the end layout - // and do row-based memcopies out. 
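// Annotation (not part of the original file): concretely, an element at
// (relative_row, relative_col) within the block is staged at
//   shared_offset = relative_row * fetch_block_row_size
//                 + (col_offsets[absolute_col] - col_offsets[fetch_block.start_col]);
// which is the byte position it will occupy in the output row, so the row-wise
// memcpy_async that follows can move whole rows with no per-column fix-up.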
- - auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { - auto const relative_col = el / num_fetch_rows; - auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; - auto const relative_col_offset = col_offset - starting_column_offset; - - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; - auto const input_src = input_data[absolute_col] + col_size * absolute_row; - - // copy the element from global memory - switch (col_size) { - case 2: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<2>(col_size), fetch_barrier); - break; - case 4: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<4>(col_size), fetch_barrier); - break; - case 8: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, - cuda::aligned_size_t<8>(col_size), fetch_barrier); - break; - default: - cuda::memcpy_async(&shared_buffer_base[shared_offset], input_src, col_size, - fetch_barrier); - break; - } - } - } - - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; - - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - auto const shared_offset = block_row_size * relative_row; - - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), subset_barrier); - } - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_data pointer to output data, partitioned by data size - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ - __global__ void copy_validity_to_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { - extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - // per conversation with DaveB - // each 
thread of warp reads a single int32 of validity - so we read 128 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. - // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); - } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); - auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); - auto const total_sections = num_sections_x * num_sections_y; - - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 32 + lane_id; - auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); - - if (absolute_col < num_columns) { - auto my_data = input_nm[absolute_col] != nullptr ? - input_nm[absolute_col][absolute_row / 32] : - std::numeric_limits::max(); - - // every thread that is participating in the warp has a byte, but it's column-based - // data and we need it in row-based. So we shuffle the bits around with ballot_sync to - // make the bytes we actually write. 
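// Illustrative scalar sketch (not part of the original file; the kernel below does this
// warp-wide with __ballot_sync, and the names here are assumptions): each column
// contributes one bit per row out of its 32-row bitmask word, and those bits are
// gathered into a row-major word whose bit c corresponds to column (start_col + c).
#include <cstdint>

uint32_t gather_row_validity(uint32_t const* const* col_bitmasks,  // [column][row / 32]
                             int start_col,
                             int num_columns,
                             int row)
{
  uint32_t row_word = 0;
  for (int c = 0; c < 32 && start_col + c < num_columns; ++c) {
    uint32_t const col_word = col_bitmasks[start_col + c] != nullptr
                                ? col_bitmasks[start_col + c][row / 32]
                                : 0xFFFFFFFFu;  // a null mask pointer means all rows valid
    if ((col_word >> (row % 32)) & 1u) { row_word |= 1u << c; }
  }
  return row_word;  // validity bits for up to 32 consecutive columns of this row
}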
- bitmask_type dw_mask = 1; - for (int i = 0; i < 32 && relative_row + i < num_rows; ++i, dw_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); - // lead thread in each warp writes data - auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { - if (cols_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; - - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - cuda::memcpy_async( - output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } - } - - static __device__ std::tuple - get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm - * @param col_sizes array of sizes for each element in a column - one per column - * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work - * @param input_data pointer to input data - * - */ - __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. 
- // This has been broken up for us in the block_info struct, so we don't have - // any calculation to do here, but it is important to note. - - // to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on - - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; - auto group = cooperative_groups::this_thread_block(); - extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; - - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); - } - } - - group.sync(); - - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); - - size_t fetch_index; - size_t processing_index; - for (processing_index = fetch_index = 0; processing_index < blocks_remaining; - ++processing_index) { - // Fetch ahead up to stages_count groups - for (; fetch_index < static_cast(blocks_remaining) && - fetch_index < (processing_index + stages_count); - ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = _col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // if we have fetched all buffers, we need to wait for processing - // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { - fetch_barrier.arrive_and_wait(); - } - - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); - - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = - (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main - cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, fetch_barrier); - } - } - - auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; - - // ensure our data is ready - processing_barrier.arrive_and_wait(); - - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - 
sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); - auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); - - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); - - // now we copy from shared memory to final destination. - // the data is laid out in rows in shared memory, so the reads - // for a column will be "vertical". Because of this and the different - // sizes for each column, this portion is handled on row/column basis. - // to prevent each thread working on a single row and also to ensure - // that all threads can do work in the case of more threads than rows, - // we do a global index instead of a double for loop with col/row. - for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; - - auto const shared_memory_row_offset = block_row_size * relative_row; - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - auto const column_size = shared_col_sizes[relative_col]; - - int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; - - cuda::memcpy_async(dst, shmem_src, column_size, processing_barrier); - } - group.sync(); - } - - // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); - } - } - - /** - * @brief copy data from row-based format to cudf columns - * - * @param num_rows total number of rows in the table - * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm - * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array - * @param input_data pointer to input data - * - */ - __global__ void copy_validity_from_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { - extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; - - // per conversation with DaveB - // each thread of warp reads a single byte of validity - so we read 32 bytes - // then ballot_sync the bits and write the result to shmem - // after we fill shared mem memcpy it out in a blob. 
- // probably need knobs for number of rows vs columns to balance read/write - auto group = cooperative_groups::this_thread_block(); - - int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); - - __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; - if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); - } - } - - group.sync(); - - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { - shared_block_barriers[validity_index].arrive_and_wait(); - } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); - - // the block is divided into sections. A warp operates on a section at a time. - for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { - // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; - auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; - - auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); - - if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; - - // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make - // the bytes we actually write. 
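// Annotation (not part of the original file): this kernel is the mirror image of
// copy_validity_to_rows above. Each lane of a warp holds one byte of one row's validity,
// and the ballot across the 32 rows of a section yields a 32-bit word already in cudf's
// column bitmask layout for column (relative_col + i); it is staged in shared memory at
// validity_data_col_length * (relative_col + i) + relative_row / 8 and later copied into
// output_nm[col] with memcpy_async.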
- for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; - ++i, byte_mask <<= 1) { - auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); - // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { - auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / 8; - - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } - } - } - } - } - - // make sure entire block has finished copy - group.sync(); - - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - - cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); - } - } - - // wait for last blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? - NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : - blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); - } - } - - #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - /** - * Calculate the dimensions of the kernel for fixed width only columns. - * @param [in] num_columns the number of columns being copied. - * @param [in] num_rows the number of rows being copied. - * @param [in] size_per_row the size each row takes up when padded. - * @param [out] blocks the size of the blocks for the kernel - * @param [out] threads the size of the threads for the kernel - * @return the size in bytes of shared memory needed for each block. - */ - static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, dim3 &blocks, - dim3 &threads) { - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. 
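// Worked instance of the arithmetic below, with assumed (illustrative) inputs of
// 10 fixed-width columns and a 32-byte padded row; the constants mirror the code.
#include <algorithm>

constexpr int num_columns_ex  = 10;
constexpr int size_per_row_ex = 32;
constexpr int y_block_size_ex = std::min((num_columns_ex + 3) / 4, 32);                // 3
constexpr int x_possible_ex   = 1024 / y_block_size_ex;                                // 341
constexpr int max_block_ex    = std::min(48 * 1024 / size_per_row_ex, x_possible_ex);  // 341
constexpr int block_size_ex   = (max_block_ex / 32) * 32;                              // 320
constexpr int shmem_bytes_ex  = size_per_row_ex * block_size_ex;                       // 10240
static_assert(block_size_ex == 320 && shmem_bytes_ex == 10240, "worked example");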
-  int y_block_size = (num_columns + 3) / 4;  // cudf::util::div_rounding_up_safe(num_columns, 4);
-  if (y_block_size > 32) {
-    y_block_size = 32;
-  }
-  int x_possible_block_size = 1024 / y_block_size;
-  // 48KB is the default setting for shared memory per block according to the cuda tutorials
-  // If someone configures the GPU to only have 16 KB this might not work.
-  int max_shared_size = 48 * 1024;
-  int max_block_size = max_shared_size / size_per_row;
-  // If we don't have enough shared memory there is no point in having more threads
-  // per block that will just sit idle
-  max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size;
-  // Make sure that the x dimension is a multiple of 32; this not only helps
-  // coalesce memory access, it also lets us do a ballot sync for validity to write
-  // the data back out at the warp level. If x is a multiple of 32 then each thread in the y
-  // dimension is associated with one or more warps, which should correspond to the validity
-  // words directly.
-  int block_size = (max_block_size / 32) * 32;
-  CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory");
-
-  int num_blocks = (num_rows + block_size - 1) / block_size;
-  if (num_blocks < 1) {
-    num_blocks = 1;
-  } else if (num_blocks > 10240) {
-    // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1
-    // but in practice having too many can cause some overhead that I don't totally
-    // understand. Playing around with this, having as little as 600 blocks appears
-    // to be able to saturate memory on V100, so this is an order of magnitude higher
-    // to try and future-proof this a bit.
-    num_blocks = 10240;
-  }
-  blocks.x = num_blocks;
-  blocks.y = 1;
-  blocks.z = 1;
-  threads.x = block_size;
-  threads.y = y_block_size;
-  threads.z = 1;
-  return size_per_row * block_size;
-}
-
-/**
- * When converting to rows it is possible that the size of the table was too big to fit
- * in a single column. This creates an output column for a subset of the rows in a table
- * going from start row and containing the next num_rows. Most of the parameters passed
- * into this function are common between runs and should be calculated once.
- */ - static std::unique_ptr - fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - - std::unique_ptr data = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_to_rows_fixed_width_optimized<<>>( - start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), - input_data.data(), input_nm.data(), data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); - } - - static cudf::data_type get_data_type(const cudf::column_view &v) { - return v.type(); - } - - static inline bool are_all_fixed_width(std::vector const &schema) { - return std::all_of(schema.begin(), schema.end(), - [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); - } - - /** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. - * @param [in] schema the types of columns that need to be laid out. - * @param [out] column_start the byte offset where each column starts in the row. - * @param [out] column_size the size in bytes of the data for each columns in the row. - * @return the size in bytes each row needs. - */ - static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. - int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add - // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); - // validity comes at the end and is byte aligned so we can pack more in. 
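// Worked instance (illustrative values, not from the original): a schema of
// {INT8, INT32, INT64} yields column_start = {0, 4, 8} and column_size = {1, 4, 8},
// so at_offset is 16 when the loop ends; the single validity byte for 3 columns brings
// it to 17, and the final align_offset(17, 8) below returns a padded row size of 24.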
- at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) - } - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - template - static size_type compute_column_information(iterator begin, iterator end, - std::vector &column_starts, - std::vector &column_sizes) //, - // std::function nested_type_cb) - { - size_type fixed_width_size_per_row = 0; - for (auto cv = begin; cv != end; ++cv) { - auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - - // a list or string column will write a single uint64 - // of data here for offset/length - auto col_size = nested_type ? 8 : size_of(col_type); - - // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); - column_starts.push_back(fixed_width_size_per_row); - column_sizes.push_back(col_size); - fixed_width_size_per_row += col_size; - } - - auto validity_offset = fixed_width_size_per_row; - column_starts.push_back(validity_offset); - - return fixed_width_size_per_row; - } - - std::vector - build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_block, - std::vector const &row_batches) { - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 8s and ship it off - return std::min(8, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(), - 8); - // we fit as much as we can given the column stride - // note that an element in the table takes just 1 bit, but a row with a single - // element still takes 8 bytes! 
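// Worked instance (assumed values, not from the original): with a 24 KiB
// shmem_limit_per_block, desired_rows_and_columns = (int)sqrt(24576) = 156; for a table
// of 300 columns that gives column_stride = round_down_safe(156, 8) = 152, so
// bytes_per_row below is align_offset(ceil(152 / 8), 8) = 24 and
// row_stride = min(num_rows, 24576 / 24) = min(num_rows, 1024).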
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); - - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int row = 0; - while (row < num_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(row_stride, rows_left_in_batch); - - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; - } - } - - return validity_block_infos; - } - - std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) { - std::vector block_infos; - - // block infos are organized with the windows going "down" the columns - // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; - - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, current_window_start_row, end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges - // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = - // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The - // trick is that it's in bytes, not rows or columns. - size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, row_batches[0].row_count); - - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. 
- return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - - int row_size = 0; - - // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { - auto const col_size = column_sizes[col]; - - // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); - - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); - row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory window boundary to match - // alignment of output row - current_window_start_col = col; - current_window_width = 0; - } else { - row_size = row_size_with_this_col; - current_window_width++; - } - } - - // build last set of blocks - if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); - } - - return block_infos; - } - - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - - } // namespace detail - - std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO: why? - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. 
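// Annotation (not part of the original file): this is why block_info carries a
// buffer_num and why row_offset is reset to zero whenever a new row batch starts below.
// copy_to_rows writes each row to output_data[block.buffer_num] + row_offsets[row] +
// column_offset, so row offsets are relative to the batch's own buffer rather than to
// the table as a whole, and nothing has to be re-copied to honor the ~2 GB per-column limit.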
- - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } - - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto iter = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); - }); - - size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - std::vector row_batches; - - uint64_t row_batch_size = 0; - uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; - - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. 
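// Worked instance (illustrative, matching the layout example above): for
// {INT8, INT32, INT64}, fixed_width_size_per_row is 16; validity_size below is
// num_bitmask_words(3) * 4 = 4 bytes, since this path reserves whole 32-bit validity
// words per row, so each row_sizes[row] ends up as align_offset(16 + 4, 8) = 24 bytes.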
- auto validity_size = num_bitmask_words(num_columns) * 4; - // thrust - for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); - - std::vector output_buffers; - std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - dim3 threads(256); - - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), - column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), - dev_input_nm.data()); - - // split up the output buffer into multiple buffers based on row batch sizes - // and create list of byte columns - int offset_offset = 0; - std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = 
detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique(data_type{type_id::INT32}, - (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_buffers[i])); - - ret.push_back( - cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); - } - - return ret; - #else - CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); - return {}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - } - - std::vector> - convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - const cudf::size_type num_columns = tbl.num_columns(); - - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. - max_rows_per_batch = (max_rows_per_batch / 32) * 32; - - cudf::size_type num_rows = tbl.num_rows(); - - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); - step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows( - row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, - dev_input_data, dev_input_nm, *zero, *step, stream, mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } - } - - std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); - - int device_id; - CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO why? - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - - std::vector column_starts; - std::vector column_sizes; - - auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { - return std::make_tuple(schema[i], nullptr); - }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); - - size_type validity_size = num_bitmask_words(num_columns) * 4; - - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); - - // Ideally we would check that the offsets are all the same, etc. 
but for now - // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - - // build the row_batches from the passed in list column - std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), - block_infos.size(), child.data()); - - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); - dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail:: - copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), - validity_block_infos.size(), child.data()); - - return std::make_unique(std::move(output_columns)); - #else - CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); - return {}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - } - - std::unique_ptr 
convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const &input, std::vector const &schema, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } - - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), - dev_output_data.data(), dev_output_nm.data(), child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); - } - } - - } // namespace cudf - \ No newline at end of file diff --git a/cpp/tests/row_conversion/row_conversion.cpp b/cpp/tests/row_conversion/row_conversion.cpp deleted file mode 100644 index b807b5cec81..00000000000 --- a/cpp/tests/row_conversion/row_conversion.cpp +++ /dev/null @@ -1,677 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -struct ColumnToRowTests : public cudf::test::BaseFixture { -}; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; - -TEST_F(ColumnToRowTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - std::vector schema = {cudf::data_type{cudf::type_id::INT32}}; - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Wide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - std::vector schema; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - cudf::table_view 
in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - for (uint i = 0; i 
< old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(ColumnToRowTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 128 columns of 2 million rows - constexpr auto num_rows = 2 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - EXPECT_EQ(old_rows.size(), new_rows.size()); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - for (int j = 0; j < old_tbl->num_columns(); ++j) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); - } - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Single) -{ - cudf::test::fixed_width_column_wrapper a({-1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Simple) -{ - cudf::test::fixed_width_column_wrapper a({-1, 0, 1}); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema{cudf::data_type{cudf::type_id::INT32}}; - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Tall) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - cudf::test::fixed_width_column_wrapper a(r, r + (size_t)4096); - cudf::table_view in(std::vector{a}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - 
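[Editor's note] The tests being removed in this diff are the main usage reference for the conversion API, so a compact round-trip sketch of the same pattern may help. It is illustrative only: it assumes the cudf::convert_to_rows / cudf::convert_from_rows entry points declared in the original cpp/include/cudf/row_conversion.hpp (the Java bindings later switch to cudf::java::), and round_trip_example is a hypothetical name.

#include <cudf/row_conversion.hpp>
#include <cudf/table/table.hpp>
#include <cudf_test/column_wrapper.hpp>

// Hypothetical helper showing the to-rows/from-rows round trip the tests above exercise.
void round_trip_example()
{
  cudf::test::fixed_width_column_wrapper<int32_t> a({-1, 0, 1});
  cudf::table_view in(std::vector<cudf::column_view>{a});

  // The schema handed to convert_from_rows is simply the column types of the input table.
  std::vector<cudf::data_type> schema;
  for (auto col = in.begin(); col != in.end(); ++col) {
    schema.push_back(col->type());
  }

  // Each returned list column holds one batch of JCUDF-encoded rows (a list of bytes per row).
  auto row_batches = cudf::convert_to_rows(in);
  for (auto const& batch : row_batches) {
    auto tbl = cudf::convert_from_rows(cudf::lists_column_view(*batch), schema);
    // tbl is expected to be equivalent to the corresponding slice of `in`.
  }
}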
-TEST_F(RowToColumnTests, Wide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({i})); // rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, SingleByteWide) -{ - std::vector> cols; - std::vector views; - - for (int i = 0; i < 256; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper({rand()})); - views.push_back(cols.back()); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - std::vector schema; - schema.reserve(in.num_columns()); - for (auto col = in.begin(); col < in.end(); ++col) { - schema.push_back(col->type()); - } - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypes) -{ - std::vector> cols; - std::vector views; - std::vector schema{cudf::data_type{cudf::type_id::INT64}, - cudf::data_type{cudf::type_id::FLOAT64}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::BOOL8}, - cudf::data_type{cudf::type_id::FLOAT32}, - cudf::data_type{cudf::type_id::INT8}, - cudf::data_type{cudf::type_id::INT32}, - cudf::data_type{cudf::type_id::INT64}}; - - cudf::test::fixed_width_column_wrapper c0({3, 9, 4, 2, 20, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c1({5.0, 9.5, 0.9, 7.23, 2.8, 0.0}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c2({5, 1, 0, 2, 7, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c3({true, false, false, true, false, false}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c4({1.0f, 3.5f, 5.9f, 7.1f, 9.8f, 0.0f}, - {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper c5({2, 3, 4, 5, 9, 0}, {1, 1, 1, 1, 1, 0}); - cudf::test::fixed_point_column_wrapper c6( - {-300, 500, 950, 90, 723, 0}, {1, 1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-2}); - cudf::test::fixed_point_column_wrapper c7( - {-80, 30, 90, 20, 200, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{-1}); - - cudf::table_view in({c0, c1, c2, c3, c4, c5, c6, c7}); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, AllTypesLarge) -{ - std::vector cols; - std::vector schema{}; - - // 10 columns of each type with 1024 entries - constexpr int num_rows{1024}; - - std::default_random_engine re; - 
std::uniform_real_distribution rand_double(std::numeric_limits::min(), - std::numeric_limits::max()); - std::uniform_int_distribution rand_int64(std::numeric_limits::min(), - std::numeric_limits::max()); - auto r = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> int64_t { return rand_int64(re); }); - auto d = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) -> double { return rand_double(re); }); - - auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); - auto none_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; }); - auto most_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 2 == 0 ? 0 : 1; }); - auto few_valid = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return rand() % 13 == 0 ? 1 : 0; }); - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT8}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::INT16}); - } - - for (int i = 0; i < 10; ++i) { - if (i < 5) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - } else { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, none_valid) - .release() - .release()); - } - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(d, d + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::FLOAT64}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_width_column_wrapper(r, r + num_rows, few_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::BOOL8}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, all_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back( - *cudf::test::fixed_width_column_wrapper( - r, r + num_rows, most_valid) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, all_valid, numeric::scale_type{-2}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL32}); - } - - for (int i = 0; i < 10; ++i) { - cols.push_back(*cudf::test::fixed_point_column_wrapper( - r, r + num_rows, most_valid, numeric::scale_type{-1}) - .release() - .release()); - schema.push_back(cudf::data_type{cudf::type_id::DECIMAL64}); - } - - std::vector views(cols.begin(), cols.end()); - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - auto new_rows = cudf::convert_to_rows(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - 
cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Non2Power) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - constexpr auto num_rows = 6 * 1024 + 557; - for (int i = 0; i < 131; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Big) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 28; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Bigger) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} - -TEST_F(RowToColumnTests, Biggest) -{ - auto r = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> int32_t { return rand(); }); - std::vector> cols; - std::vector views; - std::vector schema; - - // 28 columns of 1 million rows - constexpr auto num_rows = 5 * 1024 * 1024; - for (int i = 0; i < 128; ++i) { - cols.push_back(cudf::test::fixed_width_column_wrapper(r + num_rows * i, - r + num_rows * i + num_rows)); - views.push_back(cols.back()); - 
schema.push_back(cudf::data_type{cudf::type_id::INT32}); - } - cudf::table_view in(views); - - auto old_rows = cudf::convert_to_rows_fixed_width_optimized(in); - - for (uint i = 0; i < old_rows.size(); ++i) { - auto old_tbl = - cudf::convert_from_rows_fixed_width_optimized(cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = cudf::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); - } -} diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 932afa4bb70..f5936e86bcd 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -50,7 +50,7 @@ #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 8; +constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; @@ -409,7 +409,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; // wait for the last use of the memory to be completed - if (fetch > NUM_BLOCKS_PER_KERNEL_LOADED) { + if (fetch >= NUM_BLOCKS_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } @@ -525,7 +525,7 @@ __global__ void copy_validity_to_rows( group.sync(); for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block != validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] .arrive_and_wait(); } @@ -645,10 +645,10 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num * */ __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { + const size_type shmem_used_per_block, const size_type *row_offsets, + int8_t **output_data, const size_type *_col_sizes, + const size_type *_col_offsets, const block_info *block_infos, + const size_type num_block_infos, const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -819,8 +819,8 @@ __global__ void copy_validity_from_rows( group.sync(); for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - if (validity_block != validity_index) { + if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { + auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; shared_block_barriers[validity_index].arrive_and_wait(); } int8_t *this_shared_block = shared_blocks[validity_block % 2]; @@ -1251,7 +1251,7 @@ std::vector> convert_to_rows(cudf::table_view cons // TODO: why? 
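  // (Editorial assumption, not stated in the patch:) the 1 KiB subtracted below most
  // likely leaves headroom for the kernels' static __shared__ declarations, e.g. the
  // cuda::barrier arrays, which count against the same per-block budget reported by
  // cudaDevAttrMaxSharedMemoryPerBlock as the dynamically sized shared_data allocation.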
total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. // this window size is calculated based on the shared memory size available @@ -1368,7 +1368,7 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector block_infos = build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); + auto dev_block_infos = make_device_uvector_async(block_infos, stream); // blast through the entire table and convert it dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); @@ -1382,12 +1382,11 @@ std::vector> convert_to_rows(cudf::table_view cons auto validity_block_infos = build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - detail::copy_validity_to_rows<<>>( + detail::copy_validity_to_rows<<>>( num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), dev_input_nm.data()); @@ -1508,7 +1507,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // TODO why? total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; + int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1590,7 +1589,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in } auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail:: From e9938b96890e2bca0591a4ab857f8c36c2bf4c49 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 23 Oct 2021 01:37:52 +0000 Subject: [PATCH 63/80] removing unused header, suppressing shared warning for barrier, updating java bindings to use the correct namespace --- cpp/include/cudf/row_conversion.hpp | 51 ---------------------- java/src/main/native/src/TableJni.cpp | 9 ++-- java/src/main/native/src/row_conversion.cu | 6 ++- 3 files changed, 9 insertions(+), 57 deletions(-) delete mode 100644 cpp/include/cudf/row_conversion.hpp diff --git a/cpp/include/cudf/row_conversion.hpp b/cpp/include/cudf/row_conversion.hpp deleted file mode 100644 index 5d799f4c596..00000000000 --- a/cpp/include/cudf/row_conversion.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include - -namespace cudf { - -std::vector> convert_to_rows_fixed_width_optimized( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::vector> convert_to_rows( - cudf::table_view const& tbl, - // TODO need something for validity - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace cudf diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 45403f1eb0d..d7209a23ede 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -2697,7 +2696,7 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); std::vector> cols = - cudf::convert_to_rows_fixed_width_optimized(*n_input_table); + cudf::java::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2715,7 +2714,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::convert_to_rows(*n_input_table); + std::vector> cols = cudf::java::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2742,7 +2741,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } std::unique_ptr result = - cudf::convert_from_rows_fixed_width_optimized(list_input, types_vec); + cudf::java::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); @@ -2765,7 +2764,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::convert_from_rows(list_input, types_vec); + 
std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index f5936e86bcd..af26e4c0b0d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -49,12 +48,17 @@ #include #include +#include "row_conversion.hpp" + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; + +// needed to suppress warning about cuda::barrier +#pragma diag_suppress static_var_with_dynamic_init #endif using cudf::detail::make_device_uvector_async; From 3c6b1e5ebff9f8265f5fbb47be457e0a68fc98a8 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 30 Oct 2021 01:00:38 +0000 Subject: [PATCH 64/80] updating code to build block infos with thrust on the gpu --- java/src/main/native/src/row_conversion.cu | 670 +++++++++++++-------- 1 file changed, 418 insertions(+), 252 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index af26e4c0b0d..87ab1ed49d8 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -21,6 +21,8 @@ #include #include +#include +#include #include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -34,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -47,8 +50,7 @@ #include #include #include - -#include "row_conversion.hpp" +#include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; @@ -64,7 +66,7 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; using cudf::detail::make_device_uvector_async; using rmm::device_uvector; namespace cudf { - +namespace java { namespace detail { static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { @@ -324,6 +326,11 @@ __global__ void copy_to_rows_fixed_width_optimized( #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +/** + * @brief The GPU blocks work on one or more block_info structs of data. + * This structure defined the workspace for the block. + * + */ struct block_info { int start_col; int start_row; @@ -340,38 +347,36 @@ struct block_info { __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } }; -// When building the columns to return, we have to be mindful of the offset limit in cudf. -// It is 32-bit and these data columns are capable of surpassing that easily. The data should -// not be cut off exactly at the limit though due to the validity buffers. The most efficient -// place to cut the validity is on a 32-row boundary, so as we calculate the row sizes -// we keep track of the cut points for the validity, which we call row batches. If the row -// is larger than can be represented with the 32-bit offsets, we use the last 32-row boundary we -// hit. Note that this boundary is for our book-keeping with column pointers and not anything that -// the kernel needs to worry about. We cut the output at convienient boundaries when assembling -// the outgoing data stream. 
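[Editor's note] The comment block removed above is the rationale for row batches: offsets into the output byte column are 32-bit, so rows are grouped into batches whose total size stays under the size_type limit, preferably cut on a 32-row boundary so validity words never straddle a batch. A simplified host-side sketch of that rule follows (illustrative only; the new build_batches further below computes batch boundaries on the device with an inclusive scan of row sizes and a lower_bound against max_batch_size):

#include <cstdint>
#include <limits>
#include <vector>

// Returns the exclusive end row of the batch that starts at `batch_start`, keeping the
// batch's byte count within a 32-bit offset and cutting on a 32-row boundary when possible.
// Host-side illustration only; the real implementation scans row sizes on the GPU.
int32_t find_batch_end(std::vector<int64_t> const& row_sizes, int32_t batch_start)
{
  constexpr int64_t max_batch_size = std::numeric_limits<int32_t>::max();
  int64_t bytes = 0;
  int32_t row   = batch_start;
  while (row < static_cast<int32_t>(row_sizes.size()) &&
         bytes + row_sizes[row] <= max_batch_size) {
    bytes += row_sizes[row++];
  }
  if (row < static_cast<int32_t>(row_sizes.size())) {
    // ran out of room: back up to the last 32-row boundary so validity stays word-aligned
    auto const rounded = batch_start + ((row - batch_start) / 32) * 32;
    if (rounded > batch_start) { row = rounded; }
  }
  return row;  // rows [batch_start, row) form one batch
}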
+/** + * @brief Returning rows is done in a byte cudf column. This is limited in size by + * `size_type` and so output is broken into batches of rows that fit inside + * this limit. + * + */ struct row_batch { size_type num_bytes; size_type row_count; + device_uvector row_offsets; }; /** - * @brief copy data from cudf columns into x format, which is row-based + * @brief copy data from cudf columns into JCUDF format, which is row-based * * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table + * @param shmem_used_per_block shared memory amount each `block_info` is using + * @param block_infos span of `block_info` structs the define the work * @param input_data pointer to raw table data - * @param input_nm pointer to validity data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data * */ __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type num_block_infos, - const int8_t **input_data, const size_type *col_sizes, - const size_type *col_offsets, const block_info *block_infos, + const size_type shmem_used_per_block, + device_span block_infos, const int8_t **input_data, + const size_type *col_sizes, const size_type *col_offsets, const size_type *row_offsets, int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -396,7 +401,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum group.sync(); auto const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); size_t fetch; @@ -491,23 +496,25 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets + * @param row_offsets offset to a specific row in the input data * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array * @param input_data pointer to input data * */ -__global__ void copy_validity_to_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const bitmask_type **input_nm) { +__global__ void copy_validity_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type *row_offsets, int8_t **output_data, + const size_type validity_offset, + device_span block_infos, + const bitmask_type **input_nm) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - // per conversation with DaveB + using 
cudf::detail::warp_size; + // each thread of warp reads a single int32 of validity - so we read 128 bytes // then ballot_sync the bits and write the result to shmem // after we fill shared mem memcpy it out in a blob. @@ -515,7 +522,7 @@ __global__ void copy_validity_to_rows( auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); __shared__ cuda::barrier @@ -545,9 +552,9 @@ __global__ void copy_validity_to_rows( align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; @@ -567,7 +574,7 @@ __global__ void copy_validity_to_rows( input_nm[absolute_col][absolute_row / 32] : std::numeric_limits::max(); - // every thread that is participating in the warp has a byte, but it's column-based + // every thread that is participating in the warp has 4 bytes, but it's column-based // data and we need it in row-based. So we shuffle the bits around with ballot_sync to // make the bytes we actually write. bitmask_type dw_mask = 1; @@ -576,7 +583,7 @@ __global__ void copy_validity_to_rows( // lead thread in each warp writes data auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { if (cols_left <= 8) { // write byte this_shared_block[validity_write_offset] = validity_data & 0xFF; @@ -625,6 +632,14 @@ __global__ void copy_validity_to_rows( } } +/** + * @brief Admin data is data stored in shared memory that isn't actual column data + * + * @param col_size_size size of the column size data. + * @param col_offset_size size of the column offset data. + * @param num_cols number of columns in the block. + * @return tuple of the size of column and offset admin data. 
+ */ static __device__ std::tuple get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { auto const col_size_bytes = num_cols * col_size_size; @@ -639,9 +654,8 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets - * @param output_data - * @param output_nm + * @param row_offsets offset to a specific row in the input data + * @param output_data pointers to column data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start * @param block_infos information about the blocks of work @@ -651,8 +665,9 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, const block_info *block_infos, - const size_type num_block_infos, const int8_t *input_data) { + const size_type *_col_offsets, + device_span block_infos, + const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -678,8 +693,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col group.sync(); - auto blocks_remaining = std::min(num_block_infos - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + auto blocks_remaining = + std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); size_t fetch_index; size_t processing_index; @@ -785,23 +801,24 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param offsets - * @param output_nm + * @param row_offsets offset to a specific row in the input data + * @param output_nm pointers to null masks for columns * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work - * @param num_block_infos number of infos in blocks array * @param input_data pointer to input data * */ -__global__ void copy_validity_from_rows( - const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, - const size_type *row_offsets, cudf::bitmask_type **output_nm, const size_type validity_offset, - const block_info *block_infos, const size_type num_block_infos, const int8_t *input_data) { +__global__ void +copy_validity_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, const size_type *row_offsets, + cudf::bitmask_type **output_nm, const size_type validity_offset, + device_span block_infos, const int8_t *input_data) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + shmem_used_per_block / 2}; - // per conversation with DaveB + using cudf::detail::warp_size; 
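  // The loop below transposes row-major validity into column-major bitmask words:
  // each lane of the warp loads the validity byte of one of 32 consecutive rows (its
  // relative_row); bit i of that byte is the validity of column (relative_col + i) in
  // that row.  For each i, __ballot_sync(participation_mask, my_byte & byte_mask)
  // collects that bit from all 32 lanes into a single 32-bit word, i.e. the validity
  // of 32 consecutive rows of column (relative_col + i), which is staged in shared
  // memory and later copied out into that column's null mask.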
+ // each thread of warp reads a single byte of validity - so we read 32 bytes // then ballot_sync the bits and write the result to shmem // after we fill shared mem memcpy it out in a blob. @@ -809,7 +826,7 @@ __global__ void copy_validity_from_rows( auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min(num_block_infos - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); __shared__ cuda::barrier @@ -837,14 +854,14 @@ __global__ void copy_validity_from_rows( auto const num_sections_y = (num_block_rows + 31) / 32; auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; - int const warp_id = threadIdx.x / detail::warp_size; - int const lane_id = threadIdx.x % detail::warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / detail::warp_size); + int const warp_id = threadIdx.x / warp_size; + int const lane_id = threadIdx.x % warp_size; + auto const warps_per_block = std::max(1u, blockDim.x / warp_size); // the block is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; my_section_idx += warps_per_block) { - // convert to rows and cols + // convert section to row and col auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 8; @@ -860,13 +877,13 @@ __global__ void copy_validity_from_rows( input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; // so every thread that is participating in the warp has a byte, but it's row-based - // data and we need it in column-based. So we shiffle the bits around to make + // data and we need it in column-based. So we shuffle the bits around to make // the bytes we actually write. for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns; ++i, byte_mask <<= 1) { auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask); // lead thread in each warp writes data - if (threadIdx.x % detail::warp_size == 0) { + if (threadIdx.x % warp_size == 0) { auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; @@ -898,10 +915,10 @@ __global__ void copy_validity_from_rows( // now async memcpy the shared for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { auto const relative_col = col - block.start_col; + auto const starting_address = output_nm[col] + word_index(block_start_row); cuda::memcpy_async( - output_nm[col] + word_index(block_start_row), - &this_shared_block[validity_data_col_length * relative_col], + starting_address, &this_shared_block[validity_data_col_length * relative_col], util::div_rounding_up_unsafe(num_block_rows, 8), shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); } @@ -919,7 +936,8 @@ __global__ void copy_validity_from_rows( #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 /** - * Calculate the dimensions of the kernel for fixed width only columns. + * @brief Calculate the dimensions of the kernel for fixed width only columns. + * * @param [in] num_columns the number of columns being copied. * @param [in] num_rows the number of rows being copied. * @param [in] size_per_row the size each row takes up when padded. 
@@ -995,7 +1013,7 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty rmm::device_uvector &input_nm, const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - int64_t total_allocation = size_per_row * num_rows; + int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); @@ -1020,17 +1038,14 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } -static cudf::data_type get_data_type(const cudf::column_view &v) { - return v.type(); -} - static inline bool are_all_fixed_width(std::vector const &schema) { return std::all_of(schema.begin(), schema.end(), [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); } /** - * Given a set of fixed width columns, calculate how the data will be laid out in memory. + * @brief Given a set of fixed width columns, calculate how the data will be laid out in memory. + * * @param [in] schema the types of columns that need to be laid out. * @param [out] column_start the byte offset where each column starts in the row. * @param [out] column_size the size in bytes of the data for each columns in the row. @@ -1065,19 +1080,25 @@ static inline int32_t compute_fixed_width_layout(std::vector co #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +/** + * @brief Compute information about a table such as bytes per row and offsets. + * + * @tparam iterator iterator of column schema data + * @param begin starting iterator of column schema + * @param end ending iterator of column schema + * @param column_starts column start offsets + * @param column_sizes size in bytes of each column + * @return size of the fixed_width data portion of a row. + */ template static size_type compute_column_information(iterator begin, iterator end, std::vector &column_starts, - std::vector &column_sizes) //, -// std::function nested_type_cb) -{ + std::vector &column_sizes) { size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - // if (nested_type && nested_type_cb) { nested_type_cb(cv->get<1>()); } - // a list or string column will write a single uint64 // of data here for offset/length auto col_size = nested_type ? 8 : size_of(col_type); @@ -1096,6 +1117,15 @@ static size_type compute_column_information(iterator begin, iterator end, return fixed_width_size_per_row; } +/** + * @brief Build `block_info` for the validity data to break up the work. 
+ * + * @param num_columns number of columns in the table + * @param num_rows number of rows in the table + * @param shmem_limit_per_block size of shared memory available to a single gpu block + * @param row_batches batched row information for multiple output locations + * @return vector of `block_info` structs for validity data + */ std::vector build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, size_type const &shmem_limit_per_block, @@ -1139,43 +1169,202 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro return validity_block_infos; } -std::vector build_block_infos(std::vector const &column_sizes, - std::vector const &column_starts, - std::vector const &row_batches, - size_type const total_number_of_rows, - size_type const &shmem_limit_per_block) { - std::vector block_infos; +constexpr size_type max_batch_size = std::numeric_limits::max(); + +/** + * @brief Holds information about the batches of data to be processed + * + */ +struct batch_data { + std::vector batch_row_boundaries; + device_uvector input_data_row_offsets; + std::vector row_batches; + + batch_data(size_type num_input_offsets, rmm::cuda_stream_view stream) + : input_data_row_offsets(num_input_offsets, stream){}; +}; +/** + * @brief Builds batches of rows that will fit in the size limit of a column. + * + * @tparam RowSize iterator that gives the size of a specific row of the table. + * @param num_rows Total number of rows in the table + * @param row_sizes iterator that gives the size of a specific row of the table. + * @param stream stream to operate on for this work + * @param mr memory resource used to allocate any returned data + * @returns vector of size_type's that indicate row numbers for batch boundaries and a + * device_uvector of row offsets + */ + +template +batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); + auto const num_batches = util::div_rounding_up_safe(total_size, max_batch_size); + auto const num_offsets = num_batches + 1; + batch_data ret(num_rows + 1, stream); + + // at most max gpu memory / 2GB iterations. + ret.batch_row_boundaries.reserve(num_offsets); + ret.batch_row_boundaries.push_back(0); + size_type last_row_end = 0; + device_uvector cumulative_row_sizes(num_rows, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, + cumulative_row_sizes.begin()); + while ((int)ret.batch_row_boundaries.size() < num_offsets) { + // find the next max_batch_size boundary + size_type const row_end = + ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), + cumulative_row_sizes.begin() + (num_rows - last_row_end), + max_batch_size) - + cumulative_row_sizes.begin()) + + last_row_end); + + // build offset list for each row in this batch + auto const num_entries = row_end - last_row_end + 1; + device_uvector output_batch_row_offsets(num_entries, stream, mr); + + auto row_size_iter_bounded = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [row_end, row_sizes, last_row_end] __device__(auto i) { + return i >= row_end ? 
0 : row_sizes[i + last_row_end]; + }); + + thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, + row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); + + ret.batch_row_boundaries.push_back(row_end); + auto const batch_bytes = output_batch_row_offsets.element(row_end, stream) - + output_batch_row_offsets.element(last_row_end, stream); + auto const num_rows_in_batch = row_end - last_row_end; + ret.row_batches.push_back( + {batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + last_row_end = row_end; + } + + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, [row_sizes, num_rows] __device__(auto i) { return (i < num_rows) ? row_sizes[i] : 0; }); + thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + num_rows + 1, + ret.input_data_row_offsets.begin()); + + return ret; +} + +/** + * @brief Computes the number of blocks necessary given a window height and batch offsets + * + * @param batch_row_offsets row offsets for each batch + * @param desired_window_height height of each window in the table + * @param stream stream to use + * @return number of windows necessary + */ +int compute_block_counts(device_span const &batch_row_offsets, + int desired_window_height, rmm::cuda_stream_view stream) { + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - + batch_row_offsets[batch_index], + desired_window_height); + }); + return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); +} + +/** + * @brief Builds the `block_info` structs for a given table. + * + * @param blocks span of blocks to populate + * @param batch_row_offsets offsets to row batches + * @param column_start starting column of the window + * @param column_end ending column of the window + * @param desired_window_height height of the window + * @param total_number_of_rows total number of rows in the table + * @param stream stream to use + * @return number of windows created + */ +size_type +build_blocks(device_span blocks, + device_uvector const &batch_row_offsets, // comes from build_batches + int column_start, int column_end, int desired_window_height, int total_number_of_rows, + rmm::cuda_stream_view stream) { + size_type const num_batches = batch_row_offsets.size() - 1; + device_uvector num_blocks(num_batches, stream); + auto iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - + batch_row_offsets[batch_index], + desired_window_height); + }); + + size_type const total_blocks = + thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + + device_uvector block_starts(num_batches + 1, stream); + auto block_iter = cudf::detail::make_counting_transform_iterator( + 0, [num_blocks = num_blocks.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? 
num_blocks[i] : 0; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), block_iter, block_iter + num_batches + 1, + block_starts.begin()); // in blocks + + thrust::transform( + rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), + [=, block_starts = block_starts.data(), + batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + // what batch this block falls in + auto const batch_index_iter = + thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); + auto const batch_index = std::distance(block_starts, batch_index_iter) - 1; + // local index within the block + int const local_block_index = block_index - block_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_offsets[batch_index]; + // the start row for this block + int const block_row_start = batch_row_start + (local_block_index * desired_window_height); + // the end row for this block + int const max_row = std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches ? + std::numeric_limits::max() : + static_cast(batch_row_offsets[batch_index + 1]) - 1); + int const block_row_end = std::min( + batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); + + // stuff the block + return block_info{column_start, block_row_start, column_end, block_row_end, + static_cast(batch_index)}; + }); + + return total_blocks; +} + +/** + * @brief Determines what data should be operated on by each block for the incoming table. + * + * @tparam WindowCallback Callback that receives the start and end columns of windows + * @param column_sizes vector of the size of each column + * @param column_starts vector of the offset of each column + * @param first_row_batch_size size of the first row batch to limit max window size since a window + * is unable to span batches + * @param total_number_of_rows total number of rows in the table + * @param shmem_limit_per_block shared memory allowed per block + * @param f callback function called when building a window + */ +template +void determine_windows(std::vector const &column_sizes, + std::vector const &column_starts, + size_type const first_row_batch_size, size_type const total_number_of_rows, + size_type const &shmem_limit_per_block, WindowCallback f) { // block infos are organized with the windows going "down" the columns // this provides the most coalescing of memory access int current_window_width = 0; int current_window_start_col = 0; - // build the blocks for a specific set of columns - auto build_blocks = [&block_infos, &row_batches, total_number_of_rows]( - int const start_col, int const end_col, int const desired_window_height) { - int current_window_start_row = 0; - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; - int i = 0; - while (i < total_number_of_rows) { - if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; - } - int const window_height = std::min(desired_window_height, rows_left_in_batch); - - block_infos.emplace_back(detail::block_info{ - start_col, current_window_start_row, end_col, - std::min(current_window_start_row + window_height - 1, total_number_of_rows - 1), - current_window_row_batch}); - - i += window_height; - current_window_start_row += window_height; - rows_left_in_batch -= window_height; - } - }; - // the ideal window height has lots of 8-byte reads and 8-byte writes. 
The optimal read/write // would be memory cache line sized access, but since other blocks will read/write the edges // this may not turn out to be overly important. For now, we will attempt to build a square @@ -1183,12 +1372,10 @@ std::vector build_block_infos(std::vector const &column_s // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The // trick is that it's in bytes, not rows or columns. size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = std::clamp( - util::round_up_safe( - std::min(std::min(optimal_square_len, (size_type)column_sizes.size()) / column_sizes[0], - total_number_of_rows), - 32), - 1, row_batches[0].row_count); + int const window_height = + std::clamp(util::round_up_safe( + std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), + 1, first_row_batch_size); auto calc_admin_data_size = [](int num_cols) -> size_type { // admin data is the column sizes and column start information. @@ -1213,7 +1400,8 @@ std::vector build_block_infos(std::vector const &column_s calc_admin_data_size(col - current_window_start_col) > shmem_limit_per_block) { // too large, close this window, generate vertical blocks and restart - build_blocks(current_window_start_col, col == 0 ? col : col - 1, window_height); + f(current_window_start_col, col == 0 ? col : col - 1, window_height); + row_size = detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match @@ -1228,12 +1416,24 @@ std::vector build_block_infos(std::vector const &column_s // build last set of blocks if (current_window_width > 0) { - build_blocks(current_window_start_col, (int)column_sizes.size() - 1, window_height); + f(current_window_start_col, (int)column_sizes.size() - 1, window_height); } - - return block_infos; } +struct row_size_functor { + size_type _fixed_width_size_per_row; + size_type _num_columns; + row_size_functor(size_t fixed_width_size_per_row, size_t num_columns) + : _fixed_width_size_per_row(fixed_width_size_per_row), _num_columns(num_columns){}; + + CUDA_DEVICE_CALLABLE + int operator()(int row_index) { + auto const bytes_needed = + _fixed_width_size_per_row + util::div_rounding_up_safe(_num_columns, 8); + return detail::align_offset(bytes_needed, 8); + } +}; + #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } // namespace detail @@ -1242,9 +1442,6 @@ std::vector> convert_to_rows(cudf::table_view cons rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - // not scientifically chosen - the ideal window is long enough to allow coalesced reads of the - // data, but small enough that multiple columns fit in memory so the writes can coalese as well. - // Potential optimization for window sizes. const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -1253,7 +1450,7 @@ std::vector> convert_to_rows(cudf::table_view cons int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: why? + // TODO: why is this needed. kernel fails to launch if all memory is requested. 
total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; @@ -1277,150 +1474,113 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); - for (size_type column_number = 0; column_number < num_columns; column_number++) { - column_view cv = tbl.column(column_number); - auto const col_type = cv.type(); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; - - if (!nested_type) { - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - } + std::transform( + tbl.begin(), tbl.end(), std::back_inserter(input_data), + [](cudf::column_view const &c) -> int8_t const * { return c.template data(); }); + std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_nm), + [](auto c) { return c.null_mask(); }); auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - std::vector row_sizes; // size of each row in bytes including any alignment padding - std::vector row_offsets; // offset from the start of the data to this row std::vector column_sizes; // byte size of each column std::vector column_starts; // offset of column inside a row including alignment - std::vector - variable_width_columns; // list of the variable width columns in the table - row_sizes.reserve(num_rows); - row_offsets.reserve(num_rows); column_sizes.reserve(num_columns); column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - auto iter = + auto schema_column_iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { return std::make_tuple(tbl.column(i).type(), tbl.column(i)); }); - size_type fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + size_type fixed_width_size_per_row = detail::compute_column_information( + schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - std::vector row_batches; - - uint64_t row_batch_size = 0; - uint64_t total_table_size = 0; - size_type row_batch_rows = 0; - uint64_t row_offset = 0; + // total encoded row size. This includes fixed-width data, validity, and variable-width data. + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, detail::row_size_functor(fixed_width_size_per_row, num_columns)); // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. 
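// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): the per-row size built
// here is the fixed-width payload plus one validity bit per column rounded up
// to whole bytes, with the whole row padded to an 8-byte boundary. Host-only
// C++ with hypothetical names (align_up, jcudf_row_size), shown for clarity.
#include <cstddef>

constexpr std::size_t align_up(std::size_t value, std::size_t alignment) {
  return (value + alignment - 1) & ~(alignment - 1);
}

constexpr std::size_t jcudf_row_size(std::size_t fixed_width_bytes, std::size_t num_columns) {
  std::size_t const validity_bytes = (num_columns + 7) / 8; // one validity bit per column
  return align_up(fixed_width_bytes + validity_bytes, 8);   // rows are 8-byte aligned
}

// e.g. 36 bytes of fixed-width data across 9 columns: 36 + 2 validity bytes, padded to 40
static_assert(jcudf_row_size(36, 9) == 40, "row size example");
// ---------------------------------------------------------------------------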
auto validity_size = num_bitmask_words(num_columns) * 4; - // thrust - for (int row = 0; row < num_rows; ++row) { - auto aligned_row_batch_size = - detail::align_offset(row_batch_size, 8); // rows are 8 byte aligned - row_sizes[row] = fixed_width_size_per_row; - // validity is byte aligned - row_sizes[row] += validity_size; - // variable width data is 8-byte aligned - row_sizes[row] = detail::align_offset(row_sizes[row], 8); // rows are 8 byte aligned - - if ((uint64_t)aligned_row_batch_size + row_sizes[row] > - (uint64_t)std::numeric_limits::max()) { - // a new batch starts at the last 32-row boundary - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows & ~31}); - row_batch_size = 0; - row_batch_rows = row_batch_rows & 31; - row_offset = 0; - aligned_row_batch_size = 0; - } - row_offset = detail::align_offset(row_offset, 8); // rows are 8 byte aligned - row_offsets.push_back(row_offset); - row_batch_size = aligned_row_batch_size + row_sizes[row]; - row_offset += row_sizes[row]; - total_table_size = detail::align_offset(total_table_size, 8); // rows are 8 byte aligned - total_table_size += row_sizes[row]; - row_batch_rows++; - } - if (row_batch_size > 0) { - row_batches.push_back( - detail::row_batch{static_cast(row_batch_size), row_batch_rows}); - } - auto dev_row_offsets = make_device_uvector_async(row_offsets, stream, mr); + auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); + auto gpu_batch_row_boundaries = + make_device_uvector_async(batch_info.batch_row_boundaries, stream); + + // the first batch always exists unless we were sent an empty table + auto const first_batch_size = batch_info.row_batches[0].row_count; std::vector output_buffers; std::vector output_data; - output_data.reserve(row_batches.size()); - for (uint i = 0; i < row_batches.size(); ++i) { - rmm::device_buffer temp(row_batches[i].num_bytes, stream, mr); + output_data.reserve(batch_info.row_batches.size()); + for (uint i = 0; i < batch_info.row_batches.size(); ++i) { + rmm::device_buffer temp(batch_info.row_batches[i].num_bytes, stream, mr); output_data.push_back(static_cast(temp.data())); output_buffers.push_back(std::move(temp)); } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream); + int info_count = 0; + detail::determine_windows( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, + int const window_height) { + int i = detail::compute_block_counts(gpu_batch_row_boundaries, window_height, stream); + info_count += i; + }); + + // allocate space for blocks + device_uvector gpu_block_infos(info_count, stream); + int block_offset = 0; + + detail::determine_windows( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, + stream](int const start_col, int const end_col, int const window_height) { + block_offset += detail::build_blocks( + {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, + gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + }); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), 
NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, block_infos.size(), dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), dev_row_offsets.data(), + num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), batch_info.input_data_row_offsets.data(), reinterpret_cast(dev_output_data.data())); - auto validity_block_infos = - build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + auto validity_block_infos = detail::build_validity_block_infos( + num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, dev_row_offsets.data(), dev_output_data.data(), - column_starts.back(), dev_validity_block_infos.data(), validity_block_infos.size(), - dev_input_nm.data()); + num_rows, num_columns, shmem_limit_per_block, batch_info.input_data_row_offsets.data(), + dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns - int offset_offset = 0; std::vector> ret; - for (uint i = 0; i < row_batches.size(); ++i) { - // compute offsets for this row batch - std::vector offset_vals; - offset_vals.reserve(row_batches[i].row_count + 1); - size_type cur_offset = 0; - offset_vals.push_back(cur_offset); - for (int row = 0; row < row_batches[i].row_count; ++row) { - cur_offset = detail::align_offset(cur_offset, 8) + row_sizes[row + offset_offset]; - offset_vals.push_back(cur_offset); - } - offset_offset += row_batches[i].row_count; - - auto dev_offsets = make_device_uvector_async(offset_vals, stream, mr); - auto offsets = std::make_unique(data_type{type_id::INT32}, - (size_type)offset_vals.size(), dev_offsets.release()); - - auto data = std::make_unique(data_type{cudf::type_id::INT8}, row_batches[i].num_bytes, - std::move(output_buffers[i])); - - ret.push_back( - cudf::make_lists_column(row_batches[i].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); + for (int batch = 0; batch < (int)batch_info.row_batches.size(); ++batch) { + auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); + auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release()); + auto data = + std::make_unique(data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch])); + + ret.push_back(cudf::make_lists_column( + batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); } return ret; @@ -1437,7 +1597,8 @@ convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_str std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), detail::get_data_type); + 
std::transform(tbl.begin(), tbl.end(), schema.begin(), + [](auto i) -> cudf::data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { std::vector column_start; @@ -1509,7 +1670,7 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in int total_shmem; CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO why? + // TODO: why is this needed. kernel fails to launch if all memory is requested. total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; @@ -1519,8 +1680,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - size_type fixed_width_size_per_row = detail::compute_column_information( - iter, iter + num_columns, column_starts, column_sizes); //, [](void *) {}); + size_type fixed_width_size_per_row = + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); size_type validity_size = num_bitmask_words(num_columns) * 4; @@ -1534,8 +1695,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in // build the row_batches from the passed in list column std::vector row_batches; - - row_batches.push_back(detail::row_batch{child.size(), num_rows}); + row_batches.push_back( + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); // Allocate the columns we are going to write into std::vector> output_columns; @@ -1553,45 +1714,48 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_output_data = make_device_uvector_async(output_data, stream, mr); auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); - std::vector block_infos = - build_block_infos(column_sizes, column_starts, row_batches, num_rows, shmem_limit_per_block); - - auto dev_block_infos = make_device_uvector_async(block_infos, stream, mr); - - dim3 blocks(util::div_rounding_up_unsafe(block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + // only ever get a single batch when going from rows, so boundaries + // are 0, num_rows + device_uvector gpu_batch_row_boundaries(2, stream); + + thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(2), gpu_batch_row_boundaries.begin(), + [num_rows] __device__(auto i) { return i == 0 ? 
0 : num_rows; }); + + int info_count = 0; + detail::determine_windows(column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const window_height) { + info_count += detail::compute_block_counts(gpu_batch_row_boundaries, + window_height, stream); + }); + + // allocate space for blocks + device_uvector gpu_block_infos(info_count, stream); + + int block_offset = 0; + detail::determine_windows( + column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, + [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, + stream](int const start_col, int const end_col, int const window_height) { + block_offset += detail::build_blocks( + {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, + gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + }); + + dim3 blocks( + util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), dev_block_infos.data(), - block_infos.size(), child.data()); + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + child.data()); + + auto validity_block_infos = + detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + + auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, group it into 64s and ship it off - return std::min(64, num_columns); - } else { - return util::round_down_safe(desired_rows_and_columns, 8); - } - }(); - auto const row_stride = [&]() { - // we fit as much as we can, we know the column stride now, so calculate the row - return std::min(num_rows, util::round_down_safe(shmem_limit_per_block * 8 / column_stride, 32)); - /* if (desired_rows_and_columns > num_rows) { - return std::min(32, num_rows); - } else { - return util::round_down_safe(desired_rows_and_columns, 32); - }*/ - }(); - std::vector validity_block_infos; - for (int col = 0; col < num_columns; col += column_stride) { - for (int row = 0; row < num_rows; row += row_stride) { - validity_block_infos.emplace_back( - detail::block_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - std::min(row + row_stride - 1, num_rows - 1)}); - } - } - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream, mr); dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); @@ -1599,8 +1763,8 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in detail:: copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos.data(), - validity_block_infos.size(), child.data()); + dev_output_nm.data(), column_starts.back(), dev_validity_block_infos, + child.data()); return std::make_unique(std::move(output_columns)); #else @@ -1665,4 +1829,6 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( } } +} // namespace java + } // namespace cudf From 
630222a841470848141cb57646350420c5e05452 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 4 Nov 2021 00:02:24 +0000 Subject: [PATCH 65/80] fixing overflow issues with large tables --- java/src/main/native/src/row_conversion.cu | 202 +++++++++++---------- 1 file changed, 110 insertions(+), 92 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 87ab1ed49d8..c5bbed5274c 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include #include @@ -25,6 +27,8 @@ #include #include +#include "thrust/scan.h" + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 #include #endif @@ -50,7 +54,6 @@ #include #include #include -#include #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; @@ -336,7 +339,7 @@ struct block_info { int start_row; int end_col; int end_row; - int buffer_num; + int batch_number; __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, size_type const *const col_sizes) const { @@ -369,7 +372,7 @@ struct row_batch { * @param input_data pointer to raw table data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param row_offsets offset to a specific row in the input data + * @param row_offsets offset to a specific row in the output data * @param output_data pointer to output data * */ @@ -470,7 +473,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.buffer_num]; + auto const block_output_buffer = output_data[block.batch_number]; // copy entire rows to final dest for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; @@ -496,7 +499,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table * @param shmem_used_per_block amount of shared memory that is used by a block - * @param row_offsets offset to a specific row in the input data + * @param row_offsets offset to a specific row in the output data * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data * @param block_infos information about the blocks of work @@ -610,7 +613,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type group.sync(); auto const output_data_base = - output_data[block.buffer_num] + validity_offset + block.start_col / 8; + output_data[block.batch_number] + validity_offset + block.start_col / 8; // now async memcpy the shared memory out to the final destination for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { @@ -1176,12 +1179,18 @@ constexpr size_type max_batch_size = std::numeric_limits::max(); * */ struct batch_data { + device_uvector batch_row_offsets; std::vector batch_row_boundaries; - device_uvector input_data_row_offsets; std::vector row_batches; +}; - batch_data(size_type num_input_offsets, rmm::cuda_stream_view stream) - : 
input_data_row_offsets(num_input_offsets, stream){}; +template struct row_size_functor { + RowSize _row_sizes; + size_type _num_rows; + row_size_functor(RowSize row_sizes) : _row_sizes(row_sizes){}; + + CUDA_DEVICE_CALLABLE + uint64_t operator()(int row_index) { return static_cast(_row_sizes[row_index]); } }; /** @@ -1199,19 +1208,26 @@ struct batch_data { template batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); - auto const num_batches = util::div_rounding_up_safe(total_size, max_batch_size); + auto uint64_row_sizes = + cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); + auto const total_size = + thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); + auto const num_batches = static_cast( + util::div_rounding_up_safe(total_size, static_cast(max_batch_size))); auto const num_offsets = num_batches + 1; - batch_data ret(num_rows + 1, stream); + std::vector row_batches; + std::vector batch_row_boundaries; + device_uvector batch_row_offsets(num_rows, stream); // at most max gpu memory / 2GB iterations. - ret.batch_row_boundaries.reserve(num_offsets); - ret.batch_row_boundaries.push_back(0); + batch_row_boundaries.reserve(num_offsets); + batch_row_boundaries.push_back(0); size_type last_row_end = 0; - device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, + device_uvector cumulative_row_sizes(num_rows, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows, cumulative_row_sizes.begin()); - while ((int)ret.batch_row_boundaries.size() < num_offsets) { + + while ((int)batch_row_boundaries.size() < num_offsets) { // find the next max_batch_size boundary size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), @@ -1220,6 +1236,9 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream cumulative_row_sizes.begin()) + last_row_end); + // build offset list for each row in this batch + auto const num_rows_in_batch = row_end - last_row_end; + // build offset list for each row in this batch auto const num_entries = row_end - last_row_end + 1; device_uvector output_batch_row_offsets(num_entries, stream, mr); @@ -1232,44 +1251,44 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); - ret.batch_row_boundaries.push_back(row_end); - auto const batch_bytes = output_batch_row_offsets.element(row_end, stream) - - output_batch_row_offsets.element(last_row_end, stream); - auto const num_rows_in_batch = row_end - last_row_end; - ret.row_batches.push_back( - {batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); + + // The output_batch_row_offsets vector is used as the offset column of the returned data. This + // needs to be individually allocated, but the kernel needs a contiguous array of offsets or + // more global lookups are necessary. 
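// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): build_batches cuts the
// table into batches whose encoded bytes each stay under the 2GB offset limit
// of a list column, which it does on the device with a prefix sum and
// lower_bound. A host-side greedy version with hypothetical names produces the
// same style of boundaries and may be easier to follow.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int> batch_row_boundaries_sketch(std::vector<uint64_t> const &row_sizes,
                                             uint64_t max_batch_bytes) {
  std::vector<int> boundaries{0}; // the first batch always starts at row 0
  uint64_t batch_bytes = 0;
  for (std::size_t row = 0; row < row_sizes.size(); ++row) {
    if (batch_bytes > 0 && batch_bytes + row_sizes[row] > max_batch_bytes) {
      boundaries.push_back(static_cast<int>(row)); // this row opens a new batch
      batch_bytes = 0;
    }
    batch_bytes += row_sizes[row];
  }
  boundaries.push_back(static_cast<int>(row_sizes.size())); // closing boundary
  return boundaries;
}
// ---------------------------------------------------------------------------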
+ cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + + batch_row_boundaries.push_back(row_end); + row_batches.push_back({batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); + last_row_end = row_end; } - auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, [row_sizes, num_rows] __device__(auto i) { return (i < num_rows) ? row_sizes[i] : 0; }); - thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter, row_size_iter + num_rows + 1, - ret.input_data_row_offsets.begin()); - - return ret; + return {std::move(batch_row_offsets), batch_row_boundaries, std::move(row_batches)}; } /** * @brief Computes the number of blocks necessary given a window height and batch offsets * - * @param batch_row_offsets row offsets for each batch + * @param batch_row_boundaries row boundaries for each batch * @param desired_window_height height of each window in the table * @param stream stream to use * @return number of windows necessary */ -int compute_block_counts(device_span const &batch_row_offsets, +int compute_block_counts(device_span const &batch_row_boundaries, int desired_window_height, rmm::cuda_stream_view stream) { - size_type const num_batches = batch_row_offsets.size() - 1; + size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_blocks(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - - batch_row_offsets[batch_index], - desired_window_height); - }); + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_boundaries = + batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - + batch_row_boundaries[batch_index], + desired_window_height); + }); return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); } @@ -1277,7 +1296,7 @@ int compute_block_counts(device_span const &batch_row_offsets, * @brief Builds the `block_info` structs for a given table. 
* * @param blocks span of blocks to populate - * @param batch_row_offsets offsets to row batches + * @param batch_row_boundaries boundary to row batches * @param column_start starting column of the window * @param column_end ending column of the window * @param desired_window_height height of the window @@ -1287,20 +1306,20 @@ int compute_block_counts(device_span const &batch_row_offsets, */ size_type build_blocks(device_span blocks, - device_uvector const &batch_row_offsets, // comes from build_batches + device_uvector const &batch_row_boundaries, // comes from build_batches int column_start, int column_end, int desired_window_height, int total_number_of_rows, rmm::cuda_stream_view stream) { - size_type const num_batches = batch_row_offsets.size() - 1; + size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_blocks(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, - batch_row_offsets = batch_row_offsets.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_offsets[batch_index + 1] - - batch_row_offsets[batch_index], - desired_window_height); - }); + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), + [desired_window_height, + batch_row_boundaries = + batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - + batch_row_boundaries[batch_index], + desired_window_height); + }); size_type const total_blocks = thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); @@ -1316,7 +1335,7 @@ build_blocks(device_span blocks, thrust::transform( rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), [=, block_starts = block_starts.data(), - batch_row_offsets = batch_row_offsets.data()] __device__(size_type block_index) { + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type block_index) { // what batch this block falls in auto const batch_index_iter = thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); @@ -1324,14 +1343,15 @@ build_blocks(device_span blocks, // local index within the block int const local_block_index = block_index - block_starts[batch_index]; // the start row for this batch. - int const batch_row_start = batch_row_offsets[batch_index]; + int const batch_row_start = batch_row_boundaries[batch_index]; // the start row for this block int const block_row_start = batch_row_start + (local_block_index * desired_window_height); // the end row for this block - int const max_row = std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches ? - std::numeric_limits::max() : - static_cast(batch_row_offsets[batch_index + 1]) - 1); + int const max_row = + std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches ? 
+ std::numeric_limits::max() : + static_cast(batch_row_boundaries[batch_index + 1]) - 1); int const block_row_end = std::min( batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); @@ -1420,20 +1440,6 @@ void determine_windows(std::vector const &column_sizes, } } -struct row_size_functor { - size_type _fixed_width_size_per_row; - size_type _num_columns; - row_size_functor(size_t fixed_width_size_per_row, size_t num_columns) - : _fixed_width_size_per_row(fixed_width_size_per_row), _num_columns(num_columns){}; - - CUDA_DEVICE_CALLABLE - int operator()(int row_index) { - auto const bytes_needed = - _fixed_width_size_per_row + util::div_rounding_up_safe(_num_columns, 8); - return detail::align_offset(bytes_needed, 8); - } -}; - #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } // namespace detail @@ -1502,7 +1508,11 @@ std::vector> convert_to_rows(cudf::table_view cons // total encoded row size. This includes fixed-width data, validity, and variable-width data. auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(fixed_width_size_per_row, num_columns)); + 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { + auto const bytes_needed = + fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); + return detail::align_offset(bytes_needed, 8); + }); // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then // calculate the size of each row's variable-width data and validity as well. @@ -1518,11 +1528,14 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector output_buffers; std::vector output_data; output_data.reserve(batch_info.row_batches.size()); - for (uint i = 0; i < batch_info.row_batches.size(); ++i) { - rmm::device_buffer temp(batch_info.row_batches[i].num_bytes, stream, mr); - output_data.push_back(static_cast(temp.data())); - output_buffers.push_back(std::move(temp)); - } + output_buffers.reserve(batch_info.row_batches.size()); + std::transform(batch_info.row_batches.begin(), batch_info.row_batches.end(), + std::back_inserter(output_buffers), [&](auto const &batch) { + return rmm::device_buffer(batch.num_bytes, stream, mr); + }); + std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), + [](auto &buf) { return static_cast(buf.data()); }); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; @@ -1551,11 +1564,6 @@ std::vector> convert_to_rows(cudf::table_view cons dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); dim3 threads(256); - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), batch_info.input_data_row_offsets.data(), - reinterpret_cast(dev_output_data.data())); - auto validity_block_infos = detail::build_validity_block_infos( num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); @@ -1563,8 +1571,16 @@ std::vector> convert_to_rows(cudf::table_view cons dim3 validity_blocks( util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), + batch_info.batch_row_offsets + .data(), // needs to be row offsets per 
batch, not overall JUST for output. + reinterpret_cast(dev_output_data.data())); + detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, batch_info.input_data_row_offsets.data(), + num_rows, num_columns, shmem_limit_per_block, batch_info.batch_row_offsets.data(), dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes @@ -1693,11 +1709,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - // build the row_batches from the passed in list column - std::vector row_batches; - row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); - // Allocate the columns we are going to write into std::vector> output_columns; std::vector output_data; @@ -1711,6 +1722,11 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in output_columns.emplace_back(std::move(column)); } + // build the row_batches from the passed in list column + std::vector row_batches; + row_batches.push_back( + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); @@ -1746,10 +1762,6 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in dim3 blocks( util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, - child.data()); auto validity_block_infos = detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); @@ -1760,6 +1772,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + + detail::copy_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + child.data()); + detail:: copy_validity_from_rows<<>>( num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), From 5e66d7ce21ebf7d8ea0f75d47bad70ad0f29e9a5 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 9 Nov 2021 03:50:24 +0000 Subject: [PATCH 66/80] fixing includes for java --- java/src/main/native/src/row_conversion.cu | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c5bbed5274c..f9cb61f4ea1 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -23,16 +23,24 @@ #include #include -#include -#include #include -#include "thrust/scan.h" - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 #include #endif +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include #include #include #include @@ -40,20 +48,14 @@ #include #include #include -#include #include 
#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include + +#include "row_conversion.hpp" #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; From 37feaa1a90107cac32bdd3c5cbc02b17da2ffb9e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 16 Nov 2021 07:03:37 +0000 Subject: [PATCH 67/80] updating from review comments --- .../cudf/detail/utilities/integer_utils.hpp | 9 + cpp/src/copying/contiguous_split.cu | 9 +- java/src/main/native/src/row_conversion.cu | 449 +++++++++--------- 3 files changed, 231 insertions(+), 236 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index dc919433da7..48618ae53a1 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -60,6 +60,15 @@ inline S round_down_safe(S number_to_round, S modulus) return rounded_down; } +template +constexpr inline S round_up_unsafe(S number_to_round, S modulus) noexcept +{ + auto remainder = number_to_round % modulus; + if (remainder == 0) { return number_to_round; } + auto rounded_up = number_to_round - remainder + modulus; + return rounded_up; +} + /** * Divides the left-hand-side by the right-hand-side, rounding up * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index a9194ceea93..6467a816542 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -40,13 +40,6 @@ namespace { // align all column size allocations to this boundary so that all output column buffers // start at that alignment. static constexpr std::size_t split_align = 64; -inline __device__ std::size_t _round_up_safe(std::size_t number_to_round, std::size_t modulus) -{ - auto remainder = number_to_round % modulus; - if (remainder == 0) { return number_to_round; } - auto rounded_up = number_to_round - remainder + modulus; - return rounded_up; -} /** * @brief Struct which contains information on a source buffer. 
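// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): the round_up_unsafe helper
// added to integer_utils.hpp rounds a value up to the next multiple of
// `modulus`; "unsafe" refers to the lack of an overflow check near the type's
// maximum. A standalone copy with a few spot checks:
template <typename S>
constexpr S round_up_unsafe_sketch(S number_to_round, S modulus) noexcept {
  S const remainder = number_to_round % modulus;
  return remainder == 0 ? number_to_round : number_to_round - remainder + modulus;
}

static_assert(round_up_unsafe_sketch(64ul, 64ul) == 64ul, "already a multiple");
static_assert(round_up_unsafe_sketch(65ul, 64ul) == 128ul, "rounds up to the next multiple");
static_assert(round_up_unsafe_sketch(9, 5) == 10, "9 rounded up to a multiple of 5");
// ---------------------------------------------------------------------------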
@@ -953,7 +946,7 @@ std::vector contiguous_split(cudf::table_view const& input, std::size_t const bytes = static_cast(num_elements) * static_cast(element_size); - return dst_buf_info{_round_up_safe(bytes, 64), + return dst_buf_info{util::round_up_unsafe(bytes, 64ul), num_elements, element_size, num_rows, diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index f9cb61f4ea1..6f3998216b0 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -29,18 +29,6 @@ #include #endif -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include #include #include #include @@ -54,6 +42,14 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include "row_conversion.hpp" @@ -65,24 +61,36 @@ constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; // needed to suppress warning about cuda::barrier -#pragma diag_suppress static_var_with_dynamic_init +#pragma nv_diag_suppress static_var_with_dynamic_init #endif -using cudf::detail::make_device_uvector_async; +using namespace cudf; +using detail::make_device_uvector_async; using rmm::device_uvector; namespace cudf { namespace java { namespace detail { -static inline __host__ __device__ int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); -} - -__global__ void copy_from_rows_fixed_width_optimized( - const cudf::size_type num_rows, const cudf::size_type num_columns, - const cudf::size_type row_size, const cudf::size_type *input_offset_in_row, - const cudf::size_type *num_bytes, int8_t **output_data, cudf::bitmask_type **output_nm, - const int8_t *input_data) { +/** + * @brief Copies data from row-base JCUDF format to column-based cudf format. + * + * This optimized version of the conversion is faster for fixed-width tables + * that do not have more than 100 columns. + * + * @param num_rows number of rows in the incoming table + * @param num_columns number of columns in the incoming table + * @param row_size length in bytes of each row + * @param input_offset_in_row offset to each row of data + * @param num_bytes total number of bytes in the incoming data + * @param output_data array of pointers to the output data + * @param output_nm array of pointers to the output null masks + * @param input_data pointing to the incoming row data + */ +__global__ void +copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type num_columns, + const size_type row_size, const size_type *input_offset_in_row, + const size_type *num_bytes, int8_t **output_data, + bitmask_type **output_nm, const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -97,10 +105,10 @@ __global__ void copy_from_rows_fixed_width_optimized( // are controlled by the x dimension (there are multiple blocks in the x // dimension). 
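// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): the comment describes the
// launch shape of the fixed-width optimized kernels. threadIdx.x picks the row
// inside a row group, threadIdx.y strides over the columns, and only blockIdx.x
// varies, with extra row groups handled by a grid stride. The helper names and
// numbers below are hypothetical, for intuition only.
#include <algorithm>
#include <cuda_runtime.h>

inline dim3 fixed_width_block_shape(int rows_per_group, int column_threads) {
  return dim3(rows_per_group, column_threads); // x: one thread per row, y: column stride
}

inline dim3 fixed_width_grid_shape(int num_rows, int rows_per_group, int max_blocks = 65535) {
  int const row_groups = (num_rows + rows_per_group - 1) / rows_per_group;
  return dim3(std::min(row_groups, max_blocks)); // x only; remaining groups reached by striding
}
// ---------------------------------------------------------------------------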
- cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type rows_per_group = blockDim.x; + size_type row_group_start = blockIdx.x; + size_type row_group_stride = gridDim.x; + size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; @@ -109,28 +117,27 @@ __global__ void copy_from_rows_fixed_width_optimized( int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + for (size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; int64_t *long_shared = reinterpret_cast(shared_data); const int64_t *long_input = reinterpret_cast(input_data); - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + size_type shared_output_stride = blockDim.x * blockDim.y; + size_type row_index_end = ((row_group_index + 1) * rows_per_group); if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; + size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + size_type shared_length = row_size * num_rows_in_group; - cudf::size_type shared_output_end = shared_length / sizeof(int64_t); + size_type shared_output_end = shared_length / sizeof(int64_t); - cudf::size_type start_input_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + size_type start_input_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + for (size_type shared_index = shared_output_index; shared_index < shared_output_end; shared_index += shared_output_stride) { long_shared[shared_index] = long_input[start_input_index + shared_index]; } @@ -141,17 +148,17 @@ __global__ void copy_from_rows_fixed_width_optimized( // Within the row group there should be 1 thread for each row. This is a // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; // But we might not use all of the threads if the number of rows does not go // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data in for the next row group. 
uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; + size_type col_index_start = threadIdx.y; + size_type col_index_stride = blockDim.y; + for (size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; + size_type col_size = num_bytes[col_index]; const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); int8_t *col_output = output_data[col_index]; switch (col_size) { @@ -175,18 +182,18 @@ __global__ void copy_from_rows_fixed_width_optimized( break; } default: { - cudf::size_type output_offset = col_size * row_index; + size_type output_offset = col_size * row_index; // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { + for (size_type b = 0; b < col_size; b++) { col_output[b + output_offset] = col_tmp[b]; } break; } } - cudf::bitmask_type *nm = output_nm[col_index]; + bitmask_type *nm = output_nm[col_index]; int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + size_type byte_bit_offset = col_index % 8; int predicate = *valid_byte & (1 << byte_bit_offset); uint32_t bitmask = __ballot_sync(active_mask, predicate); if (row_index % 32 == 0) { @@ -200,10 +207,9 @@ __global__ void copy_from_rows_fixed_width_optimized( } __global__ void copy_to_rows_fixed_width_optimized( - const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type row_size, - const cudf::size_type *output_offset_in_row, const cudf::size_type *num_bytes, - const int8_t **input_data, const cudf::bitmask_type **input_nm, int8_t *output_data) { + const size_type start_row, const size_type num_rows, const size_type num_columns, + const size_type row_size, const size_type *output_offset_in_row, const size_type *num_bytes, + const int8_t **input_data, const bitmask_type **input_nm, int8_t *output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -220,10 +226,10 @@ __global__ void copy_to_rows_fixed_width_optimized( // are controlled by the x dimension (there are multiple blocks in the x // dimension). - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type rows_per_group = blockDim.x; + size_type row_group_start = blockIdx.x; + size_type row_group_stride = gridDim.x; + size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; @@ -233,20 +239,20 @@ __global__ void copy_to_rows_fixed_width_optimized( int8_t *row_vld_tmp = &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + for (size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Within the row group there should be 1 thread for each row. 
This is a // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; // But we might not use all of the threads if the number of rows does not go // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data back out. if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; col_index < num_columns; + size_type col_index_start = threadIdx.y; + size_type col_index_stride = blockDim.y; + for (size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - cudf::size_type col_size = num_bytes[col_index]; + size_type col_size = num_bytes[col_index]; int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); const int8_t *col_input = input_data[col_index]; switch (col_size) { @@ -270,9 +276,9 @@ __global__ void copy_to_rows_fixed_width_optimized( break; } default: { - cudf::size_type input_offset = col_size * row_index; + size_type input_offset = col_size * row_index; // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { + for (size_type b = 0; b < col_size; b++) { col_tmp[b] = col_input[b + input_offset]; } break; @@ -281,10 +287,10 @@ __global__ void copy_to_rows_fixed_width_optimized( // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned int8_t *valid_byte = &row_vld_tmp[col_index / 8]; - cudf::size_type byte_bit_offset = col_index % 8; + size_type byte_bit_offset = col_index % 8; uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { if (bit_is_set(input_nm[col_index], row_index)) { @@ -306,21 +312,20 @@ __global__ void copy_to_rows_fixed_width_optimized( int64_t *long_shared = reinterpret_cast(shared_data); int64_t *long_output = reinterpret_cast(output_data); - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + size_type shared_input_stride = blockDim.x * blockDim.y; + size_type row_index_end = ((row_group_index + 1) * rows_per_group); if (row_index_end > num_rows) { row_index_end = num_rows; } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; + size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + size_type shared_length = row_size * num_rows_in_group; - cudf::size_type shared_input_end = shared_length / sizeof(int64_t); + size_type shared_input_end = shared_length / sizeof(int64_t); - cudf::size_type start_output_index = - (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + size_type start_output_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - for 
(cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + for (size_type shared_index = shared_input_index; shared_index < shared_input_end; shared_index += shared_input_stride) { long_output[start_output_index + shared_index] = long_shared[shared_index]; } @@ -343,13 +348,14 @@ struct block_info { int end_row; int batch_number; - __host__ __device__ size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { - return align_offset(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], 8); + constexpr size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) const { + return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], + 8); } - __host__ __device__ size_type num_cols() const { return end_col - start_col + 1; } + constexpr size_type num_cols() const { return end_col - start_col + 1; } - __host__ __device__ size_type num_rows() const { return end_row - start_row + 1; } + constexpr size_type num_rows() const { return end_row - start_row + 1; } }; /** @@ -554,7 +560,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = - align_offset(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + util::round_up_unsafe(util::div_rounding_up_unsafe(num_block_cols, 8), 8); auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -736,7 +742,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); shared_row_offset += col_offset_bytes; - shared_row_offset = align_offset(shared_row_offset, 8); + shared_row_offset = util::round_up_unsafe(shared_row_offset, 8); for (auto row = fetch_block_start_row + static_cast(threadIdx.x); row <= fetch_block_end_row; row += blockDim.x) { @@ -764,7 +770,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto shared_col_offsets = reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); - auto const shared_row_offset = align_offset(col_size_bytes + col_offset_bytes, 8); + auto const shared_row_offset = util::round_up_unsafe(col_size_bytes + col_offset_bytes, 8); auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); @@ -813,11 +819,12 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * @param input_data pointer to input data * */ -__global__ void -copy_validity_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - cudf::bitmask_type **output_nm, const size_type validity_offset, - device_span block_infos, const int8_t *input_data) { +__global__ void copy_validity_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_block, + const size_type *row_offsets, bitmask_type **output_nm, + const size_type validity_offset, + device_span block_infos, + const int8_t *input_data) { extern __shared__ int8_t shared_data[]; int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { shared_data, shared_data + 
shmem_used_per_block / 2}; @@ -950,10 +957,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, * @param [out] threads the size of the threads for the kernel * @return the size in bytes of shared memory needed for each block. */ -static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, dim3 &blocks, - dim3 &threads) { +static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_type num_rows, + const size_type size_per_row, dim3 &blocks, dim3 &threads) { // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -963,37 +968,29 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; // cudf::util::div_rounding_up_safe(num_columns, 4); - if (y_block_size > 32) { - y_block_size = 32; - } - int x_possible_block_size = 1024 / y_block_size; + int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); + int const x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size / size_per_row; + int const max_shared_size = 48 * 1024; // If we don't have enough shared memory there is no point in having more threads // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; + auto const max_block_size = std::min(x_possible_block_size, max_shared_size / size_per_row); // Make sure that the x dimension is a multiple of 32 this not only helps // coalesce memory access it also lets us do a ballot sync for validity to write // the data back out the warp level. If x is a multiple of 32 then each thread in the y // dimension is associated with one or more warps, that should correspond to the validity // words directly. - int block_size = (max_block_size / 32) * 32; + int const block_size = (max_block_size / 32) * 32; CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. - num_blocks = 10240; - } + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. 
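// A minimal sketch of the block-count policy described above (illustrative only;
// sketch_num_blocks is a hypothetical helper, the real values come from the
// surrounding function).
#include <algorithm>

inline int sketch_num_blocks(int num_rows, int block_size) {
  int const wanted = (num_rows + block_size - 1) / block_size;  // ceil-divide rows over block size
  return std::clamp(wanted, 1, 10240);  // at least one block, capped to limit launch overhead
}
// e.g. sketch_num_blocks(1 << 20, 256) == 4096, sketch_num_blocks(10, 256) == 1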
+ int const num_blocks = std::clamp((num_rows + block_size - 1) / block_size, 1, 10240); + blocks.x = num_blocks; blocks.y = 1; blocks.z = 1; @@ -1009,26 +1006,24 @@ static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, * going from start row and containing the next num_rows. Most of the parameters passed * into this function are common between runs and should be calculated once. */ -static std::unique_ptr -fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_type num_rows, - const cudf::size_type num_columns, const cudf::size_type size_per_row, - rmm::device_uvector &column_start, - rmm::device_uvector &column_size, - rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, - const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +static std::unique_ptr fixed_width_convert_to_rows( + const size_type start_row, const size_type num_rows, const size_type num_columns, + const size_type size_per_row, rmm::device_uvector &column_start, + rmm::device_uvector &column_size, rmm::device_uvector &input_data, + rmm::device_uvector &input_nm, const scalar &zero, + const scalar &scalar_size_per_row, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = + std::unique_ptr offsets = cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); - std::unique_ptr data = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), - cudf::mask_state::UNALLOCATED, stream, mr); + std::unique_ptr data = + make_numeric_column(data_type(type_id::INT8), static_cast(total_allocation), + mask_state::UNALLOCATED, stream, mr); dim3 blocks; dim3 threads; @@ -1039,13 +1034,13 @@ fixed_width_convert_to_rows(const cudf::size_type start_row, const cudf::size_ty start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), input_data.data(), input_nm.data(), data->mutable_view().data()); - return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); + return make_lists_column(num_rows, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } -static inline bool are_all_fixed_width(std::vector const &schema) { +static inline bool are_all_fixed_width(std::vector const &schema) { return std::all_of(schema.begin(), schema.end(), - [](const cudf::data_type &t) { return cudf::is_fixed_width(t); }); + [](const data_type &t) { return is_fixed_width(t); }); } /** @@ -1056,18 +1051,18 @@ static inline bool are_all_fixed_width(std::vector const &schem * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. 
*/ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { +static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) { // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. int32_t at_offset = 0; for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); + size_type s = size_of(*col); column_size.emplace_back(s); std::size_t allocation_needed = s; std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); + at_offset = util::round_up_unsafe(at_offset, static_cast(alignment_needed)); column_start.emplace_back(at_offset); at_offset += allocation_needed; } @@ -1075,12 +1070,11 @@ static inline int32_t compute_fixed_width_layout(std::vector co // Now we need to add in space for validity // Eventually we can think about nullable vs not nullable, but for now we will just always add // it in - int32_t validity_bytes_needed = - (schema.size() + 7) / 8; // cudf::util::div_rounding_up_safe(schema.size(), 8); + int32_t const validity_bytes_needed = util::div_rounding_up_safe(schema.size(), 8); // validity comes at the end and is byte aligned so we can pack more in. at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) + return util::round_up_unsafe(at_offset, 8); // 8 bytes (64 bits) } #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -1109,8 +1103,8 @@ static size_type compute_column_information(iterator begin, iterator end, auto col_size = nested_type ? 8 : size_of(col_type); // align size for this type - std::size_t const alignment_needed = col_size; // They are the same for fixed width types - fixed_width_size_per_row = detail::align_offset(fixed_width_size_per_row, alignment_needed); + size_type const alignment_needed = col_size; // They are the same for fixed width types + fixed_width_size_per_row = util::round_up_unsafe(fixed_width_size_per_row, alignment_needed); column_starts.push_back(fixed_width_size_per_row); column_sizes.push_back(col_size); fixed_width_size_per_row += col_size; @@ -1136,7 +1130,7 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro size_type const &shmem_limit_per_block, std::vector const &row_batches) { auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); - auto const column_stride = align_offset( + auto const column_stride = util::round_up_unsafe( [&]() { if (desired_rows_and_columns > num_columns) { // not many columns, group it into 8s and ship it off @@ -1146,10 +1140,11 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro } }(), 8); + // we fit as much as we can given the column stride // note that an element in the table takes just 1 bit, but a row with a single // element still takes 8 bytes! 
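// Worked example of the validity sizing above (a sketch with made-up numbers, not part
// of the patch): a column_stride of 24 columns packs into ceil(24 / 8) = 3 bytes of
// validity bits, which is then padded to the 8-byte row alignment.
#include <algorithm>
#include <cstdio>

int main() {
  int const column_stride = 24;                 // hypothetical columns handled per validity block
  int const shmem_limit_per_block = 24 * 1024;  // hypothetical shared-memory budget
  int const num_rows = 1000000;
  int const packed_bytes = (column_stride + 7) / 8;        // 1 bit per column, packed into bytes
  int const bytes_per_row = ((packed_bytes + 7) / 8) * 8;  // padded to 8-byte alignment -> 8
  int const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row);
  std::printf("bytes_per_row=%d row_stride=%d\n", bytes_per_row, row_stride);  // prints: bytes_per_row=8 row_stride=3072
}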
- auto const bytes_per_row = align_offset(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const bytes_per_row = util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), 8); auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; @@ -1210,8 +1205,7 @@ template struct row_size_functor { template batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto uint64_row_sizes = - cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); + auto uint64_row_sizes = cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); auto const total_size = thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); auto const num_batches = static_cast( @@ -1413,10 +1407,10 @@ void determine_windows(std::vector const &column_sizes, auto const col_size = column_sizes[col]; // align size for this type - std::size_t alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = detail::align_offset(row_size, alignment_needed); + auto const alignment_needed = col_size; // They are the same for fixed width types + auto row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = detail::align_offset(row_size_with_this_col, 8); + auto row_size_with_end_pad = util::round_up_unsafe(row_size_with_this_col, 8); if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > @@ -1425,7 +1419,7 @@ void determine_windows(std::vector const &column_sizes, f(current_window_start_col, col == 0 ? 
col : col - 1, window_height); row_size = - detail::align_offset((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); row_size += col_size; // alignment required for shared memory window boundary to match // alignment of output row current_window_start_col = col; @@ -1446,9 +1440,9 @@ void determine_windows(std::vector const &column_sizes, } // namespace detail -std::vector> convert_to_rows(cudf::table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::vector> convert_to_rows(table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 const size_type num_columns = tbl.num_columns(); const size_type num_rows = tbl.num_rows(); @@ -1482,9 +1476,8 @@ std::vector> convert_to_rows(cudf::table_view cons std::vector input_nm; input_data.reserve(num_columns); input_nm.reserve(num_columns); - std::transform( - tbl.begin(), tbl.end(), std::back_inserter(input_data), - [](cudf::column_view const &c) -> int8_t const * { return c.template data(); }); + std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_data), + [](column_view const &c) -> int8_t const * { return c.template data(); }); std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_nm), [](auto c) { return c.null_mask(); }); @@ -1502,7 +1495,7 @@ std::vector> convert_to_rows(cudf::table_view cons return std::make_tuple(tbl.column(i).type(), tbl.column(i)); }); - size_type fixed_width_size_per_row = detail::compute_column_information( + auto const fixed_width_size_per_row = detail::compute_column_information( schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); @@ -1513,13 +1506,9 @@ std::vector> convert_to_rows(cudf::table_view cons 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { auto const bytes_needed = fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); - return detail::align_offset(bytes_needed, 8); + return util::round_up_unsafe(bytes_needed, 8); }); - // fixed_width_size_per_row is the size of the fixed-width portion of a row. We need to then - // calculate the size of each row's variable-width data and validity as well. 
- auto validity_size = num_bitmask_words(num_columns) * 4; - auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); auto gpu_batch_row_boundaries = make_device_uvector_async(batch_info.batch_row_boundaries, stream); @@ -1587,19 +1576,22 @@ std::vector> convert_to_rows(cudf::table_view cons // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns - std::vector> ret; - for (int batch = 0; batch < (int)batch_info.row_batches.size(); ++batch) { - auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); - auto offsets = std::make_unique(data_type{type_id::INT32}, (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release()); - auto data = - std::make_unique(data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, - std::move(output_buffers[batch])); - - ret.push_back(cudf::make_lists_column( - batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr)); - } + std::vector> ret; + auto counting_iter = thrust::make_counting_iterator(0); + std::transform(counting_iter, counting_iter + batch_info.row_batches.size(), + std::back_inserter(ret), [&](auto batch) { + auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); + auto offsets = std::make_unique( + data_type{type_id::INT32}, (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release()); + auto data = std::make_unique(data_type{type_id::INT8}, + batch_info.row_batches[batch].num_bytes, + std::move(output_buffers[batch])); + + return make_lists_column( + batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), + 0, rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); + }); return ret; #else @@ -1608,55 +1600,55 @@ std::vector> convert_to_rows(cudf::table_view cons #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::vector> -convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_stream_view stream, +std::vector> +convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - const cudf::size_type num_columns = tbl.num_columns(); + auto const num_columns = tbl.num_columns(); - std::vector schema; + std::vector schema; schema.resize(num_columns); std::transform(tbl.begin(), tbl.end(), schema.begin(), - [](auto i) -> cudf::data_type { return i.type(); }); + [](auto i) -> data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; + std::vector column_start; + std::vector column_size; - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + int32_t const size_per_row = + detail::compute_fixed_width_layout(schema, column_start, column_size); auto dev_column_start = make_device_uvector_async(column_start, stream, mr); auto dev_column_size = make_device_uvector_async(column_size, stream, mr); - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; // Make the number of rows per batch a multiple of 32 so we don't have to worry about // splitting validity at a specific row offset. This might change in the future. 
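// Sketch of the batching arithmetic described above (illustrative only;
// sketch_max_rows_per_batch is a hypothetical helper): the per-batch row count is
// bounded by what fits behind an int32 offset and rounded down to a multiple of 32 so
// validity words never straddle a batch boundary.
#include <cstdint>
#include <limits>

inline int32_t sketch_max_rows_per_batch(int32_t size_per_row) {
  auto const rows_that_fit = std::numeric_limits<int32_t>::max() / size_per_row;  // rows addressable by int32 offsets
  return (rows_that_fit / 32) * 32;                                               // round down to a multiple of 32
}
// e.g. with size_per_row == 24, the result is the largest multiple of 32 not exceeding INT32_MAX / 24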
- max_rows_per_batch = (max_rows_per_batch / 32) * 32; + int32_t const max_rows_per_batch = + util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); - cudf::size_type num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); + std::vector input_nm; + for (size_type column_number = 0; column_number < num_columns; column_number++) { + column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + using ScalarType = scalar_type_t; + auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); zero->set_valid_async(true, stream); static_cast(zero.get())->set_value(0, stream); - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + auto step = make_numeric_scalar(data_type(type_id::INT32), stream.value()); step->set_valid_async(true, stream); - static_cast(step.get()) - ->set_value(static_cast(size_per_row), stream); + static_cast(step.get())->set_value(static_cast(size_per_row), stream); - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; + std::vector> ret; + for (size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + size_type row_count = num_rows - row_start; row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; ret.emplace_back(detail::fixed_width_convert_to_rows( row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, @@ -1669,19 +1661,19 @@ convert_to_rows_fixed_width_optimized(cudf::table_view const &tbl, rmm::cuda_str } } -std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, - std::vector const &schema, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::unique_ptr convert_from_rows(lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + column_view child = input.child(); + auto const list_type = child.type().id(); + CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); - cudf::size_type num_columns = schema.size(); - cudf::size_type num_rows = input.parent().size(); + auto const num_columns = schema.size(); + auto const num_rows = input.parent().size(); int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -1692,18 +1684,18 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in total_shmem -= 1024; int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; - std::vector column_starts; - std::vector column_sizes; + std::vector column_starts; + std::vector column_sizes; auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - size_type fixed_width_size_per_row = + auto const fixed_width_size_per_row = detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); - size_type validity_size = num_bitmask_words(num_columns) * 4; + auto const validity_size = num_bitmask_words(num_columns) * 4; - size_type row_size = detail::align_offset(fixed_width_size_per_row + validity_size, 8); + auto const row_size = util::round_up_unsafe(fixed_width_size_per_row + validity_size, 8); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1712,12 +1704,12 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); // Allocate the columns we are going to write into - std::vector> output_columns; + std::vector> output_columns; std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); + std::vector output_nm; + for (int i = 0; i < static_cast(num_columns); i++) { + auto column = + make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); output_nm.emplace_back(mut.null_mask()); @@ -1786,30 +1778,31 @@ std::unique_ptr convert_from_rows(cudf::lists_column_view const &in dev_output_nm.data(), column_starts.back(), dev_validity_block_infos, child.data()); - return std::make_unique(std::move(output_columns)); + return std::make_unique
(std::move(output_columns)); #else CUDF_FAIL("Row to column conversion optimization requires volta or later hardware."); return {}; #endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 } -std::unique_ptr convert_from_rows_fixed_width_optimized( - cudf::lists_column_view const &input, std::vector const &schema, +std::unique_ptr
convert_from_rows_fixed_width_optimized( + lists_column_view const &input, std::vector const &schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + column_view child = input.child(); + auto const list_type = child.type().id(); + CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); - cudf::size_type num_columns = schema.size(); + auto const num_columns = schema.size(); if (detail::are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; + std::vector column_start; + std::vector column_size; - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); + auto const num_rows = input.parent().size(); + int32_t const size_per_row = + detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1819,12 +1812,12 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( auto dev_column_size = make_device_uvector_async(column_size, stream); // Allocate the columns we are going to write into - std::vector> output_columns; + std::vector> output_columns; std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], num_rows, - cudf::mask_state::UNINITIALIZED, stream, mr); + std::vector output_nm; + for (int i = 0; i < static_cast(num_columns); i++) { + auto column = + make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); output_nm.emplace_back(mut.null_mask()); @@ -1843,7 +1836,7 @@ std::unique_ptr convert_from_rows_fixed_width_optimized( num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), dev_output_data.data(), dev_output_nm.data(), child.data()); - return std::make_unique(std::move(output_columns)); + return std::make_unique
(std::move(output_columns)); } else { CUDF_FAIL("Only fixed width types are currently supported"); } From 27d44d908ab27aed900fac2b0d7e9d0ad5498ea7 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Nov 2021 19:20:37 +0000 Subject: [PATCH 68/80] Updating from review comments --- java/src/main/native/src/row_conversion.cu | 166 +++++++++++---------- 1 file changed, 85 insertions(+), 81 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 6f3998216b0..b7840da9b30 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -60,6 +60,9 @@ constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +constexpr auto JCUDF_ROW_ALIGNMENT = 8; +constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); + // needed to suppress warning about cuda::barrier #pragma nv_diag_suppress static_var_with_dynamic_init #endif @@ -105,10 +108,10 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // are controlled by the x dimension (there are multiple blocks in the x // dimension). - size_type rows_per_group = blockDim.x; - size_type row_group_start = blockIdx.x; - size_type row_group_stride = gridDim.x; - size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type const rows_per_group = blockDim.x; + size_type const row_group_start = blockIdx.x; + size_type const row_group_stride = gridDim.x; + size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; @@ -117,25 +120,22 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - for (size_type row_group_index = row_group_start; row_group_index < row_group_end; + for (auto row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; int64_t *long_shared = reinterpret_cast(shared_data); - const int64_t *long_input = reinterpret_cast(input_data); + int64_t const *long_input = reinterpret_cast(input_data); - size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - size_type shared_output_stride = blockDim.x * blockDim.y; - size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - size_type shared_length = row_size * num_rows_in_group; + auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + auto const shared_output_stride = blockDim.x * blockDim.y; + auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); + auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + auto const shared_length = row_size * num_rows_in_group; - size_type shared_output_end = shared_length / sizeof(int64_t); + size_type const shared_output_end = shared_length / sizeof(int64_t); - size_type start_input_index = (row_size * row_group_index * rows_per_group) / sizeof(int64_t); + auto const start_input_index = (row_size * row_group_index * rows_per_group) / 
sizeof(int64_t); for (size_type shared_index = shared_output_index; shared_index < shared_output_end; shared_index += shared_output_stride) { @@ -148,18 +148,18 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // Within the row group there should be 1 thread for each row. This is a // requirement for launching the kernel - size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + auto const row_index = (row_group_index * rows_per_group) + threadIdx.x; // But we might not use all of the threads if the number of rows does not go // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data in for the next row group. uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); if (row_index < num_rows) { - size_type col_index_start = threadIdx.y; - size_type col_index_stride = blockDim.y; - for (size_type col_index = col_index_start; col_index < num_columns; + auto const col_index_start = threadIdx.y; + auto const col_index_stride = blockDim.y; + for (auto col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - size_type col_size = num_bytes[col_index]; - const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + auto const col_size = num_bytes[col_index]; + int8_t const *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); int8_t *col_output = output_data[col_index]; switch (col_size) { case 1: { @@ -182,9 +182,9 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n break; } default: { - size_type output_offset = col_size * row_index; + auto const output_offset = col_size * row_index; // TODO this should just not be supported for fixed width columns, but just in case... 
- for (size_type b = 0; b < col_size; b++) { + for (auto b = 0; b < col_size; b++) { col_output[b + output_offset] = col_tmp[b]; } break; @@ -351,7 +351,7 @@ struct block_info { constexpr size_type get_shared_row_size(size_type const *const col_offsets, size_type const *const col_sizes) const { return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], - 8); + JCUDF_ROW_ALIGNMENT); } constexpr size_type num_cols() const { return end_col - start_col + 1; } @@ -365,9 +365,21 @@ struct block_info { * */ struct row_batch { - size_type num_bytes; - size_type row_count; - device_uvector row_offsets; + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column +}; + +/** + * @brief Holds information about the batches of data to be processed + * + */ +struct batch_data { + device_uvector + batch_row_offsets; // offsets to each row in the JCUDF data from batch start + std::vector + batch_row_boundaries; // row numbers for different batches like 0, 10000, 20000 + std::vector row_batches; // information about each batch such as byte count }; /** @@ -412,8 +424,8 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum group.sync(); auto const blocks_remaining = - std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_TO_ROWS); + std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, + static_cast(NUM_BLOCKS_PER_KERNEL_TO_ROWS)); size_t fetch; size_t subset; @@ -441,7 +453,7 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // and do row-based memcopies out. auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = (int)threadIdx.x; el < num_elements_in_block; el += blockDim.x) { + for (auto el = static_cast(threadIdx.x); el < num_elements_in_block; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; auto const absolute_col = relative_col + fetch_block.start_col; @@ -533,8 +545,8 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + static_cast(NUM_VALIDITY_BLOCKS_PER_KERNEL)); __shared__ cuda::barrier shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; @@ -560,7 +572,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); auto const validity_data_row_length = - util::round_up_unsafe(util::div_rounding_up_unsafe(num_block_cols, 8), 8); + util::round_up_unsafe(util::div_rounding_up_unsafe(num_block_cols, 8), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -705,8 +717,8 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col group.sync(); auto blocks_remaining = - std::min((uint)block_infos.size() - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - (uint)NUM_BLOCKS_PER_KERNEL_FROM_ROWS); + 
std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, + static_cast(NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); size_t fetch_index; size_t processing_index; @@ -838,8 +850,8 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto group = cooperative_groups::this_thread_block(); int const blocks_remaining = - std::min((uint)block_infos.size() - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - (uint)NUM_VALIDITY_BLOCKS_PER_KERNEL); + std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, + static_cast(NUM_VALIDITY_BLOCKS_PER_KERNEL)); __shared__ cuda::barrier shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; @@ -862,8 +874,8 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto const block_start_row = block.start_row; auto const num_block_cols = block.num_cols(); auto const num_block_rows = block.num_rows(); - auto const num_sections_x = (num_block_cols + 7) / 8; - auto const num_sections_y = (num_block_rows + 31) / 32; + auto const num_sections_x = util::div_rounding_up_safe(num_block_cols, 8); + auto const num_sections_y = util::div_rounding_up_safe(num_block_rows, 32); auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -1015,7 +1027,8 @@ static std::unique_ptr fixed_width_convert_to_rows( rmm::mr::device_memory_resource *mr) { int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), + "Table is too large to fit!"); // Allocate and set the offsets row for the byte array std::unique_ptr offsets = @@ -1074,7 +1087,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s // validity comes at the end and is byte aligned so we can pack more in. 
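// Worked example of the fixed-width layout rules above (a sketch, not part of the
// patch; the schema of sizes {1, 4, 8} is hypothetical): each column start is aligned
// to its own size, validity bits are appended byte-aligned, and the row is padded to
// 8 bytes.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> sizes{1, 4, 8};  // e.g. INT8, INT32, INT64
  int at_offset = 0;
  for (int s : sizes) {
    at_offset = ((at_offset + s - 1) / s) * s;  // align the column start to its own size
    std::printf("column of size %d starts at offset %d\n", s, at_offset);
    at_offset += s;
  }
  at_offset += (static_cast<int>(sizes.size()) + 7) / 8;  // validity bits, byte aligned at the end
  int const row_size = ((at_offset + 7) / 8) * 8;         // pad so every row is 64-bit aligned
  std::printf("row size = %d bytes\n", row_size);         // prints 24 for this schema
}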
at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned - return util::round_up_unsafe(at_offset, 8); // 8 bytes (64 bits) + return util::round_up_unsafe(at_offset, JCUDF_ROW_ALIGNMENT); } #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 @@ -1096,7 +1109,7 @@ static size_type compute_column_information(iterator begin, iterator end, size_type fixed_width_size_per_row = 0; for (auto cv = begin; cv != end; ++cv) { auto col_type = std::get<0>(*cv); - bool nested_type = col_type.id() == type_id::LIST || col_type.id() == type_id::STRING; + bool nested_type = is_compound(col_type); // a list or string column will write a single uint64 // of data here for offset/length @@ -1129,7 +1142,7 @@ std::vector build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, size_type const &shmem_limit_per_block, std::vector const &row_batches) { - auto const desired_rows_and_columns = (int)sqrt(shmem_limit_per_block); + auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_block)); auto const column_stride = util::round_up_unsafe( [&]() { if (desired_rows_and_columns > num_columns) { @@ -1139,12 +1152,13 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro return util::round_down_safe(desired_rows_and_columns, 8); } }(), - 8); + JCUDF_ROW_ALIGNMENT); // we fit as much as we can given the column stride // note that an element in the table takes just 1 bit, but a row with a single // element still takes 8 bytes! - auto const bytes_per_row = util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), 8); + auto const bytes_per_row = + util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), JCUDF_ROW_ALIGNMENT); auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); std::vector validity_block_infos; @@ -1169,18 +1183,6 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro return validity_block_infos; } -constexpr size_type max_batch_size = std::numeric_limits::max(); - -/** - * @brief Holds information about the batches of data to be processed - * - */ -struct batch_data { - device_uvector batch_row_offsets; - std::vector batch_row_boundaries; - std::vector row_batches; -}; - template struct row_size_functor { RowSize _row_sizes; size_type _num_rows; @@ -1205,11 +1207,12 @@ template struct row_size_functor { template batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto uint64_row_sizes = cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); + auto uint64_row_sizes = + cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); auto const total_size = thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); auto const num_batches = static_cast( - util::div_rounding_up_safe(total_size, static_cast(max_batch_size))); + util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); auto const num_offsets = num_batches + 1; std::vector row_batches; std::vector batch_row_boundaries; @@ -1223,12 +1226,12 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream thrust::inclusive_scan(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows, cumulative_row_sizes.begin()); - while ((int)batch_row_boundaries.size() < num_offsets) { - // find the next max_batch_size boundary + while (static_cast(batch_row_boundaries.size()) 
< num_offsets) { + // find the next MAX_BATCH_SIZE boundary size_type const row_end = ((thrust::lower_bound(rmm::exec_policy(stream), cumulative_row_sizes.begin(), cumulative_row_sizes.begin() + (num_rows - last_row_end), - max_batch_size) - + MAX_BATCH_SIZE) - cumulative_row_sizes.begin()) + last_row_end); @@ -1346,7 +1349,7 @@ build_blocks(device_span blocks, int const max_row = std::min(total_number_of_rows - 1, batch_index + 1 > num_batches ? - std::numeric_limits::max() : + std::numeric_limits::max() : static_cast(batch_row_boundaries[batch_index + 1]) - 1); int const block_row_end = std::min( batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); @@ -1387,8 +1390,8 @@ void determine_windows(std::vector const &column_sizes, // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The // trick is that it's in bytes, not rows or columns. - size_type const optimal_square_len = size_type(sqrt(shmem_limit_per_block)); - int const window_height = + auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_block)); + auto const window_height = std::clamp(util::round_up_safe( std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), 1, first_row_batch_size); @@ -1403,14 +1406,15 @@ void determine_windows(std::vector const &column_sizes, int row_size = 0; // march each column and build the blocks of appropriate sizes - for (unsigned int col = 0; col < column_sizes.size(); ++col) { + for (uint col = 0; col < column_sizes.size(); ++col) { auto const col_size = column_sizes[col]; // align size for this type auto const alignment_needed = col_size; // They are the same for fixed width types - auto row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); - auto row_size_with_this_col = row_size_aligned + col_size; - auto row_size_with_end_pad = util::round_up_unsafe(row_size_with_this_col, 8); + auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); + auto const row_size_with_this_col = row_size_aligned + col_size; + auto const row_size_with_end_pad = + util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); if (row_size_with_end_pad * window_height + calc_admin_data_size(col - current_window_start_col) > @@ -1432,7 +1436,7 @@ void determine_windows(std::vector const &column_sizes, // build last set of blocks if (current_window_width > 0) { - f(current_window_start_col, (int)column_sizes.size() - 1, window_height); + f(current_window_start_col, static_cast(column_sizes.size()) - 1, window_height); } } @@ -1444,8 +1448,8 @@ std::vector> convert_to_rows(table_view const &tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - const size_type num_columns = tbl.num_columns(); - const size_type num_rows = tbl.num_rows(); + auto const num_columns = tbl.num_columns(); + auto const num_rows = tbl.num_rows(); int device_id; CUDA_TRY(cudaGetDevice(&device_id)); @@ -1454,7 +1458,7 @@ std::vector> convert_to_rows(table_view const &tbl, // TODO: why is this needed. kernel fails to launch if all memory is requested. total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; + auto const shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; // break up the work into blocks, which are a starting and ending row/col #. 
// this window size is calculated based on the shared memory size available @@ -1506,7 +1510,7 @@ std::vector> convert_to_rows(table_view const &tbl, 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { auto const bytes_needed = fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); - return util::round_up_unsafe(bytes_needed, 8); + return util::round_up_unsafe(bytes_needed, JCUDF_ROW_ALIGNMENT); }); auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); @@ -1621,8 +1625,8 @@ convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_vi // Make the number of rows per batch a multiple of 32 so we don't have to worry about // splitting validity at a specific row offset. This might change in the future. - int32_t const max_rows_per_batch = - util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); + auto const max_rows_per_batch = + util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); auto const num_rows = tbl.num_rows(); @@ -1695,7 +1699,8 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto const validity_size = num_bitmask_words(num_columns) * 4; - auto const row_size = util::round_up_unsafe(fixed_width_size_per_row + validity_size, 8); + auto const row_size = + util::round_up_unsafe(fixed_width_size_per_row + validity_size, JCUDF_ROW_ALIGNMENT); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1755,7 +1760,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, dim3 blocks( util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), (int)child.size())); + dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), static_cast(child.size()))); auto validity_block_infos = detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); @@ -1801,8 +1806,7 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( std::vector column_size; auto const num_rows = input.parent().size(); - int32_t const size_per_row = - detail::compute_fixed_width_layout(schema, column_start, column_size); + auto const size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine From 3a488440bb5e59f5e95f93e86b35d2bba3920828 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 18 Nov 2021 04:48:56 +0000 Subject: [PATCH 69/80] removing odd size writing since destination is now padded --- java/src/main/native/src/row_conversion.cu | 39 ++-------------------- 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index b7840da9b30..8ee7b893dd9 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -375,8 +375,7 @@ struct row_batch { * */ struct batch_data { - device_uvector - batch_row_offsets; // offsets to each row in the JCUDF data from batch start + device_uvector batch_row_offsets; // offset column of returned cudf column std::vector batch_row_boundaries; // row numbers for different batches like 0, 10000, 20000 std::vector row_batches; // information about each batch such as byte count @@ -607,23 +606,7 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; if (threadIdx.x % warp_size == 0) { - if (cols_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (cols_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (cols_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } + *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; } } } @@ -911,23 +894,7 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - if (rows_left <= 8) { - // write byte - this_shared_block[validity_write_offset] = validity_data & 0xFF; - } else if (rows_left <= 16) { - // write int16 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - } else if (rows_left <= 24) { - // write int16 and then int8 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data & 0xFFFF; - shared_data[validity_write_offset + 2] = (validity_data >> 16) & 0xFF; - } else { - // write int32 - *reinterpret_cast(&this_shared_block[validity_write_offset]) = - validity_data; - } + *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; } } } From 7595eaf36dd6fad557fa32a0ea383f5fbce13b36 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 7 Dec 2021 06:37:09 +0000 Subject: [PATCH 70/80] performance improvements --- java/src/main/native/src/row_conversion.cu | 165 +++++++++------------ 1 file changed, 73 insertions(+), 92 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu 
b/java/src/main/native/src/row_conversion.cu index 8ee7b893dd9..c44ac7343e7 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -489,20 +489,25 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; subset_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; + auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); auto const column_offset = col_offsets[block.start_col]; auto const block_output_buffer = output_data[block.batch_number]; - // copy entire rows to final dest - for (auto absolute_row = block.start_row + threadIdx.x; absolute_row <= block.end_row; - absolute_row += blockDim.x) { - auto const relative_row = absolute_row - block.start_row; - auto const output_dest = block_output_buffer + row_offsets[absolute_row] + column_offset; - auto const shared_offset = block_row_size * relative_row; + // copy entire row 8 bytes at a time + auto const chunks_per_row = util::div_rounding_up_unsafe(block_row_size, 8); + auto const total_chunks = chunks_per_row * block.num_rows(); - cuda::memcpy_async(output_dest, &shared[subset % stages_count][shared_offset], - cuda::aligned_size_t<8>(block_row_size), subset_barrier); + for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { + // determine source address of my chunk + auto const relative_row = i / chunks_per_row; + auto const relative_chunk_offset = (i % chunks_per_row) * 8; + auto const output_dest = block_output_buffer + row_offsets[relative_row + block.start_row] + + column_offset + relative_chunk_offset; + auto const input_src = + &shared[subset % stages_count][block_row_size * relative_row + relative_chunk_offset]; + + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); } } @@ -588,7 +593,6 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const relative_row = section_y * 32; auto const absolute_col = relative_col + block.start_col; auto const absolute_row = relative_row + block.start_row; - auto const cols_left = num_columns - absolute_col; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { @@ -618,15 +622,29 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const output_data_base = output_data[block.batch_number] + validity_offset + block.start_col / 8; - // now async memcpy the shared memory out to the final destination - for (int row = block.start_row + threadIdx.x; row <= block.end_row; row += blockDim.x) { - auto const relative_row = row - block.start_row; - auto const output_ptr = output_data_base + row_offsets[row]; - auto const num_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); - - cuda::memcpy_async( - output_ptr, &this_shared_block[validity_data_row_length * relative_row], num_bytes, - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + // now async memcpy the shared memory out to the final destination 4 bytes at a time since we do + // 32-row chunks + auto const row_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + auto const chunks_per_row = util::div_rounding_up_unsafe(row_bytes, 8); + auto const total_chunks = chunks_per_row * block.num_rows(); + auto &subset_barrier = + 
shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + auto const tail_bytes = row_bytes % 8; + + for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { + // determine source address of my chunk + auto const relative_row = i / chunks_per_row; + auto const col_chunk = i % chunks_per_row; + auto const relative_chunk_offset = col_chunk * 8; + auto const output_dest = + output_data_base + row_offsets[relative_row + block.start_row] + relative_chunk_offset; + auto const input_src = + &this_shared_block[validity_data_row_length * relative_row + relative_chunk_offset]; + + if (tail_bytes > 0 && col_chunk == chunks_per_row - 1) + cuda::memcpy_async(output_dest, input_src, tail_bytes, subset_barrier); + else + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); } } @@ -638,22 +656,6 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type } } -/** - * @brief Admin data is data stored in shared memory that isn't actual column data - * - * @param col_size_size size of the column size data. - * @param col_offset_size size of the column offset data. - * @param num_cols number of columns in the block. - * @return tuple of the size of column and offset admin data. - */ -static __device__ std::tuple -get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num_cols) { - auto const col_size_bytes = num_cols * col_size_size; - auto const col_offset_bytes = num_cols * col_offset_size; - - return {col_size_bytes, col_offset_bytes}; -} - /** * @brief copy data from row-based format to cudf columns * @@ -670,8 +672,8 @@ get_admin_data_sizes(size_t col_size_size, size_t col_offset_size, int const num */ __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *_col_sizes, - const size_type *_col_offsets, + int8_t **output_data, const size_type *col_sizes, + const size_type *col_offsets, device_span block_infos, const int8_t *input_data) { // We are going to copy the data in two passes. 
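The chunked copies above move each shared-memory row out 8 bytes at a time, falling back to a short tail copy when the row's validity byte count is not a multiple of 8. A minimal host-side sketch of that chunk indexing, using example sizes and printf in place of cuda::memcpy_async (illustration only, not part of the patch):

#include <cstdio>

int main() {
  int const row_bytes = 21;                        // validity bytes per row in a tile (example value)
  int const num_rows = 3;                          // rows in the tile (example value)
  int const chunks_per_row = (row_bytes + 7) / 8;  // util::div_rounding_up_unsafe(row_bytes, 8)
  int const tail_bytes = row_bytes % 8;

  for (int i = 0; i < chunks_per_row * num_rows; ++i) {
    int const relative_row = i / chunks_per_row;   // which row this chunk belongs to
    int const col_chunk = i % chunks_per_row;      // which 8-byte chunk within that row
    int const byte_offset = col_chunk * 8;
    int const copy_bytes = (tail_bytes > 0 && col_chunk == chunks_per_row - 1) ? tail_bytes : 8;
    std::printf("chunk %d: row %d, byte offset %d, copy %d bytes\n", i, relative_row, byte_offset,
                copy_bytes);
  }
  return 0;
}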
@@ -714,12 +716,8 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; auto const fetch_block_start_row = fetch_block.start_row; - auto const fetch_block_end_row = fetch_block.end_row; - auto const starting_col_offset = _col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(_col_offsets, _col_sizes); - auto const num_fetch_cols = fetch_block.num_cols(); - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), num_fetch_cols); + auto const starting_col_offset = col_offsets[fetch_block.start_col]; + auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; // if we have fetched all buffers, we need to wait for processing @@ -728,22 +726,10 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col fetch_barrier.arrive_and_wait(); } - auto shared_row_offset = 0; - // copy the data for column sizes - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_sizes[fetch_block.start_col], col_size_bytes, fetch_barrier); - shared_row_offset += col_size_bytes; - // copy the data for column offsets - cuda::memcpy_async(group, &shared[fetch_index % stages_count][shared_row_offset], - &_col_offsets[fetch_block.start_col], col_offset_bytes, fetch_barrier); - shared_row_offset += col_offset_bytes; - shared_row_offset = util::round_up_unsafe(shared_row_offset, 8); - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block_end_row; row += blockDim.x) { - auto shared_offset = - (row - fetch_block_start_row) * fetch_block_row_size + shared_row_offset; - // copy the main + row <= fetch_block.end_row; row += blockDim.x) { + auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size; + // copy the data cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], &input_data[row_offsets[row] + starting_col_offset], fetch_block_row_size, fetch_barrier); @@ -755,19 +741,10 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // ensure our data is ready processing_barrier.arrive_and_wait(); - auto block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; + auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; auto const rows_in_block = block.num_rows(); auto const cols_in_block = block.num_cols(); - - auto [col_size_bytes, col_offset_bytes] = get_admin_data_sizes( - sizeof(decltype(*_col_sizes)), sizeof(decltype(*_col_offsets)), cols_in_block); - auto shared_col_sizes = reinterpret_cast(shared[processing_index % stages_count]); - auto shared_col_offsets = - reinterpret_cast(&shared[processing_index % stages_count][col_size_bytes]); - - auto const shared_row_offset = util::round_up_unsafe(col_size_bytes + col_offset_bytes, 8); - - auto block_row_size = block.get_shared_row_size(_col_offsets, _col_sizes); + auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); // now we copy from shared memory to final destination. 
// the data is laid out in rows in shared memory, so the reads @@ -783,9 +760,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto const absolute_row = relative_row + block.start_row; auto const shared_memory_row_offset = block_row_size * relative_row; - auto const shared_memory_offset = shared_col_offsets[relative_col] - shared_col_offsets[0] + - shared_memory_row_offset + shared_row_offset; - auto const column_size = shared_col_sizes[relative_col]; + auto const shared_memory_offset = + col_offsets[absolute_col] - col_offsets[block.start_col] + shared_memory_row_offset; + auto const column_size = col_sizes[absolute_col]; int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; @@ -875,7 +852,6 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto const relative_row = section_y * 32 + lane_id; auto const absolute_col = relative_col + block_start_col; auto const absolute_row = relative_row + block_start_row; - auto const rows_left = num_rows - absolute_row; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); @@ -903,15 +879,29 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ // make sure entire block has finished copy group.sync(); - // now async memcpy the shared - for (int col = block.start_col + threadIdx.x; col <= block.end_col; col += blockDim.x) { - auto const relative_col = col - block.start_col; - auto const starting_address = output_nm[col] + word_index(block_start_row); - - cuda::memcpy_async( - starting_address, &this_shared_block[validity_data_col_length * relative_col], - util::div_rounding_up_unsafe(num_block_rows, 8), - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]); + // now async memcpy the shared memory out to the final destination 8 bytes at a time + auto const col_bytes = util::div_rounding_up_unsafe(num_block_rows, 8); + auto const chunks_per_col = util::div_rounding_up_unsafe(col_bytes, 8); + auto const total_chunks = chunks_per_col * num_block_cols; + auto &subset_barrier = + shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + auto const tail_bytes = col_bytes % 8; + + for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { + // determine source address of my chunk + auto const relative_col = i / chunks_per_col; + auto const row_chunk = i % chunks_per_col; + auto const absolute_col = relative_col + block_start_col; + auto const relative_chunk_byte_offset = row_chunk * 8; + auto const output_dest = + output_nm[absolute_col] + word_index(block_start_row) + row_chunk * 2; + auto const input_src = + &this_shared_block[validity_data_col_length * relative_col + relative_chunk_byte_offset]; + + if (tail_bytes > 0 && row_chunk == chunks_per_col - 1) + cuda::memcpy_async(output_dest, input_src, tail_bytes, subset_barrier); + else + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); } } @@ -1363,13 +1353,6 @@ void determine_windows(std::vector const &column_sizes, std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), 1, first_row_batch_size); - auto calc_admin_data_size = [](int num_cols) -> size_type { - // admin data is the column sizes and column start information. - // this is copied to shared memory as well and needs to be accounted for - // in the window calculation. 
- return num_cols * sizeof(size_type) + num_cols * sizeof(size_type); - }; - int row_size = 0; // march each column and build the blocks of appropriate sizes @@ -1383,9 +1366,7 @@ void determine_windows(std::vector const &column_sizes, auto const row_size_with_end_pad = util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); - if (row_size_with_end_pad * window_height + - calc_admin_data_size(col - current_window_start_col) > - shmem_limit_per_block) { + if (row_size_with_end_pad * window_height > shmem_limit_per_block) { // too large, close this window, generate vertical blocks and restart f(current_window_start_col, col == 0 ? col : col - 1, window_height); From 74afad7c64e1f1c6529adfe4ca7df6baa7748cc1 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 14 Dec 2021 21:10:07 -0500 Subject: [PATCH 71/80] Update java/src/main/native/src/row_conversion.cu Co-authored-by: MithunR --- java/src/main/native/src/row_conversion.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c44ac7343e7..15a7fbf02e3 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -75,7 +75,7 @@ namespace java { namespace detail { /** - * @brief Copies data from row-base JCUDF format to column-based cudf format. + * @brief Copies data from row-based JCUDF format to column-based cudf format. * * This optimized version of the conversion is faster for fixed-width tables * that do not have more than 100 columns. From 57a84e4e2663ea2a291892caa76f4cc2640a3a9d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 23 Dec 2021 02:03:02 +0000 Subject: [PATCH 72/80] changes from review comments --- java/src/main/native/src/row_conversion.cu | 1058 +++++++++++--------- 1 file changed, 595 insertions(+), 463 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 15a7fbf02e3..c1b6bdbce5d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -14,21 +14,7 @@ * limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include - #include -#include - -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -#include -#endif - #include #include #include @@ -50,17 +36,31 @@ #include #include #include +#include #include "row_conversion.hpp" #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -constexpr auto NUM_BLOCKS_PER_KERNEL_FROM_ROWS = 2; -constexpr auto NUM_BLOCKS_PER_KERNEL_TO_ROWS = 2; -constexpr auto NUM_BLOCKS_PER_KERNEL_LOADED = 2; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL = 8; -constexpr auto NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED = 2; +#include +#endif + +#include +#include +#include +#include +#include +#include +#include constexpr auto JCUDF_ROW_ALIGNMENT = 8; + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +constexpr auto NUM_TILES_PER_KERNEL_FROM_ROWS = 2; +constexpr auto NUM_TILES_PER_KERNEL_TO_ROWS = 2; +constexpr auto NUM_TILES_PER_KERNEL_LOADED = 2; +constexpr auto NUM_VALIDITY_TILES_PER_KERNEL = 8; +constexpr auto NUM_VALIDITY_TILES_PER_KERNEL_LOADED = 2; + constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); // needed to suppress warning about cuda::barrier @@ -71,9 +71,139 @@ using namespace cudf; using detail::make_device_uvector_async; using rmm::device_uvector; namespace cudf { -namespace java { +namespace jni { namespace detail { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +/************************************************************************ + * This module converts data from row-major to column-major and from column-major + * to row-major. It is a transpose of the data of sorts, but there are a few + * complicating factors. They are spelled out below: + * + * Row Batches: + * The row data has to fit inside a + * cuDF column, which limits it to 2 gigs currently. The calling code attempts + * to keep the data size under 2 gigs, but due to padding this isn't always + * the case, so being able to break this up into multiple columns is necessary. + * Internally, this is referred to as the row batch, which is a group of rows + * that will fit into this 2 gig space requirement. There are typically 1 of + * these batches, but there can be 2. + * + * Async Memcpy: + * The CUDA blocks are using memcpy_async, which allows for the device to + * schedule memcpy operations and then wait on them to complete at a later + * time with a barrier. The recommendation is to double-buffer the work + * so that processing can occur while a copy operation is being completed. + * On Ampere or later hardware there is dedicated hardware to do this copy + * and on pre-Ampere it should generate the same code that a hand-rolled + * loop would generate, so performance should be the same or better than + * a hand-rolled kernel. + * + * Tile Info: + * Each CUDA block will work on NUM_TILES_PER_KERNEL_*_ROWS tile infos + * before exiting. It will have enough shared memory available to load + * NUM_TILES_PER_KERNEL_LOADED tiles at one time. The block will load + * as many tiles as it can fit into shared memory and then wait on the + * first tile to completely load before processing. Processing in this + * case means coping the data from shared memory back out to device + * memory via memcpy_async. This kernel is completely memory bound. + * + * Batch Data: + * This structure contains all the row batches and some book-keeping + * data necessary for the batches such as row numbers for the batches. + * + * Tiles: + * The tile info describes a tile of data to process. 
In a GPU with + * 48KB of shared memory each tile uses approximately 24KB of memory + * which equates to about 144 bytes in each direction. The tiles are + * kept as square as possible to attempt to coalesce memory operations. + * The taller a tile is the better coalescing of columns, but row + * coalescing suffers. The wider a tile is the better the row coalescing, + * but columns coalescing suffers. The code attempts to produce a square + * tile to balance the coalescing. It starts by figuring out the optimal + * byte length and then adding columns to the data until the tile is too + * large. Since rows are different width with different alignment + * requirements, this isn't typically exact. Once a width is found the + * tiles are generated vertically with that width and height and then + * the process repeats. This means all the tiles will be the same + * height, but will have different widths based on what columns they + * encompass. Tiles in a vertical row will all have the same dimensions. + * + * -------------------------------- + * | 4 5.0f || True 8 3 1 | + * | 3 6.0f || False 3 1 1 | + * | 2 7.0f || True 7 4 1 | + * | 1 8.0f || False 2 5 1 | + * -------------------------------- + * | 0 9.0f || True 6 7 1 | + * ... + ************************************************************************/ + +/** + * @brief The CUDA blocks work on one or more tile_info structs of data. + * This structure defines the workspaces for the blocks. + * + */ +struct tile_info { + int start_col; + int start_row; + int end_col; + int end_row; + int batch_number; + + CUDA_DEVICE_CALLABLE + size_type get_shared_row_size(size_type const *const col_offsets, + size_type const *const col_sizes) const { + return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], + JCUDF_ROW_ALIGNMENT); + } + + CUDA_DEVICE_CALLABLE + size_type num_cols() const { return end_col - start_col + 1; } + + CUDA_DEVICE_CALLABLE + size_type num_rows() const { return end_row - start_row + 1; } +}; + +/** + * @brief Returning rows is done in a byte cudf column. This is limited in size by + * `size_type` and so output is broken into batches of rows that fit inside + * this limit. + * + */ +struct row_batch { + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column +}; + +/** + * @brief Holds information about the batches of data to be processed + * + */ +struct batch_data { + device_uvector batch_row_offsets; // offset column of returned cudf column + device_uvector d_batch_row_boundaries; // row numbers for the start of each batch + std::vector + batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 + std::vector row_batches; // information about each batch such as byte count +}; + +struct row_offset_functor { + row_offset_functor(size_type fixed_width_only_row_size) + : _fixed_width_only_row_size(fixed_width_only_row_size){}; + + CUDA_DEVICE_CALLABLE + size_type operator()(int row_number, int tile_row_start) { + return (row_number - tile_row_start) * _fixed_width_only_row_size; + } + + size_type _fixed_width_only_row_size; +}; + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + /** * @brief Copies data from row-based JCUDF format to column-based cudf format. 
* @@ -336,132 +466,92 @@ __global__ void copy_to_rows_fixed_width_optimized( #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 -/** - * @brief The GPU blocks work on one or more block_info structs of data. - * This structure defined the workspace for the block. - * - */ -struct block_info { - int start_col; - int start_row; - int end_col; - int end_row; - int batch_number; - - constexpr size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { - return util::round_up_unsafe(col_offsets[end_col] + col_sizes[end_col] - col_offsets[start_col], - JCUDF_ROW_ALIGNMENT); - } - constexpr size_type num_cols() const { return end_col - start_col + 1; } - - constexpr size_type num_rows() const { return end_row - start_row + 1; } -}; - -/** - * @brief Returning rows is done in a byte cudf column. This is limited in size by - * `size_type` and so output is broken into batches of rows that fit inside - * this limit. - * - */ -struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column -}; - -/** - * @brief Holds information about the batches of data to be processed - * - */ -struct batch_data { - device_uvector batch_row_offsets; // offset column of returned cudf column - std::vector - batch_row_boundaries; // row numbers for different batches like 0, 10000, 20000 - std::vector row_batches; // information about each batch such as byte count -}; - /** * @brief copy data from cudf columns into JCUDF format, which is row-based * + * @tparam RowOffsetIter iterator that gives the size of a specific row of the table. * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table - * @param shmem_used_per_block shared memory amount each `block_info` is using - * @param block_infos span of `block_info` structs the define the work + * @param shmem_used_per_tile shared memory amount each `tile_info` is using + * @param tile_infos span of `tile_info` structs the define the work * @param input_data pointer to raw table data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start * @param row_offsets offset to a specific row in the output data + * @param batch_row_boundaries row numbers for batch starts * @param output_data pointer to output data * */ +template __global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - device_span block_infos, const int8_t **input_data, + const size_type shmem_used_per_tile, + device_span tile_infos, const int8_t **input_data, const size_type *col_sizes, const size_type *col_offsets, - const size_type *row_offsets, int8_t **output_data) { + RowOffsetIter row_offsets, size_type const *batch_row_boundaries, + int8_t **output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have + // This has been broken up for us in the tile_info struct, so we don't have // any calculation to do here, but it is important to note. 
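The fetch/process loop that follows keeps fetch_index at most NUM_TILES_PER_KERNEL_LOADED tiles ahead of processing_index, so one shared-memory buffer can be filled while the previously fetched tile is written out. A host-side sketch of just that pipeline indexing, with prints standing in for the memcpy_async/barrier work (example tile count; not part of the patch):

#include <cstdio>

int main() {
  int const tiles_remaining = 5;  // tiles assigned to this block (example value)
  int const stages_count = 2;     // NUM_TILES_PER_KERNEL_LOADED in the kernel
  int fetch_index = 0;
  for (int processing_index = 0; processing_index < tiles_remaining; ++processing_index) {
    // fetch ahead, but never more than stages_count tiles beyond the one being processed
    for (; fetch_index < tiles_remaining && fetch_index < processing_index + stages_count;
         ++fetch_index) {
      std::printf("fetch tile %d into shared buffer %d\n", fetch_index, fetch_index % stages_count);
    }
    std::printf("process tile %d from shared buffer %d\n", processing_index,
                processing_index % stages_count);
  }
  return 0;
}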
- constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + constexpr unsigned stages_count = NUM_TILES_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_tile}; - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + __shared__ cuda::barrier tile_barrier[NUM_TILES_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); + for (int i = 0; i < NUM_TILES_PER_KERNEL_LOADED; ++i) { + init(&tile_barrier[i], group.size()); } } group.sync(); - auto const blocks_remaining = - std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS, - static_cast(NUM_BLOCKS_PER_KERNEL_TO_ROWS)); - - size_t fetch; - size_t subset; - for (subset = fetch = 0; subset < blocks_remaining; ++subset) { - // Fetch ahead up to stages_count subsets - for (; fetch < blocks_remaining && fetch < (subset + stages_count); ++fetch) { - auto const fetch_block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + fetch]; - auto const num_fetch_cols = fetch_block.num_cols(); - auto const num_fetch_rows = fetch_block.num_rows(); - auto const num_elements_in_block = num_fetch_cols * num_fetch_rows; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto const starting_column_offset = col_offsets[fetch_block.start_col]; - auto &fetch_barrier = block_barrier[fetch % NUM_BLOCKS_PER_KERNEL_LOADED]; + auto const tiles_remaining = + std::min(static_cast(tile_infos.size()) - blockIdx.x * NUM_TILES_PER_KERNEL_TO_ROWS, + static_cast(NUM_TILES_PER_KERNEL_TO_ROWS)); + + size_t fetch_index; //< tile we are currently fetching + size_t processing_index; //< tile we are currently processing + for (processing_index = fetch_index = 0; processing_index < tiles_remaining; ++processing_index) { + // Fetch ahead up to NUM_TILES_PER_KERNEL_LOADED + for (; fetch_index < tiles_remaining && fetch_index < (processing_index + stages_count); + ++fetch_index) { + auto const fetch_tile = tile_infos[blockIdx.x * NUM_TILES_PER_KERNEL_TO_ROWS + fetch_index]; + auto const num_fetch_cols = fetch_tile.num_cols(); + auto const num_fetch_rows = fetch_tile.num_rows(); + auto const num_elements_in_tile = num_fetch_cols * num_fetch_rows; + auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); + auto const starting_column_offset = col_offsets[fetch_tile.start_col]; + auto &fetch_barrier = tile_barrier[fetch_index % NUM_TILES_PER_KERNEL_LOADED]; // wait for the last use of the memory to be completed - if (fetch >= NUM_BLOCKS_PER_KERNEL_LOADED) { + if (fetch_index >= NUM_TILES_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } // to do the copy we need to do n column copies followed by m element copies OR // we have to do m element copies followed by r row copies. When going from column // to row it is much easier to copy by elements first otherwise we would need a running - // total of the column sizes for our block, which isn't readily available. This makes it + // total of the column sizes for our tile, which isn't readily available. This makes it // more appealing to copy element-wise from input data into shared matching the end layout // and do row-based memcopies out. 
- auto const shared_buffer_base = shared[fetch % stages_count]; - for (auto el = static_cast(threadIdx.x); el < num_elements_in_block; el += blockDim.x) { + auto const shared_buffer_base = shared[fetch_index % stages_count]; + for (auto el = static_cast(threadIdx.x); el < num_elements_in_tile; el += blockDim.x) { auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; - auto const absolute_col = relative_col + fetch_block.start_col; - auto const absolute_row = relative_row + fetch_block.start_row; + auto const absolute_col = relative_col + fetch_tile.start_col; + auto const absolute_row = relative_row + fetch_tile.start_row; auto const col_size = col_sizes[absolute_col]; auto const col_offset = col_offsets[absolute_col]; auto const relative_col_offset = col_offset - starting_column_offset; - auto const shared_offset = relative_row * fetch_block_row_size + relative_col_offset; + auto const shared_offset = relative_row * fetch_tile_row_size + relative_col_offset; auto const input_src = input_data[absolute_col] + col_size * absolute_row; // copy the element from global memory @@ -486,59 +576,65 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum } } - auto &subset_barrier = block_barrier[subset % NUM_BLOCKS_PER_KERNEL_LOADED]; - subset_barrier.arrive_and_wait(); + auto &processing_barrier = tile_barrier[processing_index % NUM_TILES_PER_KERNEL_LOADED]; + processing_barrier.arrive_and_wait(); - auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_TO_ROWS + subset]; - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); - auto const column_offset = col_offsets[block.start_col]; - auto const block_output_buffer = output_data[block.batch_number]; + auto const tile = tile_infos[blockIdx.x * NUM_TILES_PER_KERNEL_TO_ROWS + processing_index]; + auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); + auto const column_offset = col_offsets[tile.start_col]; + auto const tile_output_buffer = output_data[tile.batch_number]; + auto const row_batch_start = + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; // copy entire row 8 bytes at a time - auto const chunks_per_row = util::div_rounding_up_unsafe(block_row_size, 8); - auto const total_chunks = chunks_per_row * block.num_rows(); + auto const chunks_per_row = util::div_rounding_up_unsafe(tile_row_size, 8); + auto const total_chunks = chunks_per_row * tile.num_rows(); for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { // determine source address of my chunk auto const relative_row = i / chunks_per_row; auto const relative_chunk_offset = (i % chunks_per_row) * 8; - auto const output_dest = block_output_buffer + row_offsets[relative_row + block.start_row] + + auto const output_dest = tile_output_buffer + + row_offsets(relative_row + tile.start_row, row_batch_start) + column_offset + relative_chunk_offset; - auto const input_src = - &shared[subset % stages_count][block_row_size * relative_row + relative_chunk_offset]; + auto const input_src = &shared[processing_index % stages_count] + [tile_row_size * relative_row + relative_chunk_offset]; - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); } } // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); + for (uint i = 0; i < std::min(stages_count, tiles_remaining); ++i) { + tile_barrier[i].arrive_and_wait(); } } /** * @brief copy data from row-based format to cudf columns * + * @tparam RowOffsetIter iterator that gives the size of a specific row of the table. * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block + * @param shmem_used_per_tile amount of shared memory that is used by a tile * @param row_offsets offset to a specific row in the output data + * @param batch_row_boundaries row numbers for batch starts * @param output_data pointer to output data, partitioned by data size * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work - * @param input_data pointer to input data + * @param tile_infos information about the tiles of work + * @param input_nm pointer to input data * */ -__global__ void copy_validity_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - const size_type *row_offsets, int8_t **output_data, - const size_type validity_offset, - device_span block_infos, - const bitmask_type **input_nm) { +template +__global__ void +copy_validity_to_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_tile, RowOffsetIter row_offsets, + size_type const *batch_row_boundaries, int8_t **output_data, + const size_type validity_offset, device_span tile_infos, + const bitmask_type **input_nm) { extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; + int8_t *shared_tiles[NUM_VALIDITY_TILES_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_tile / 2}; using cudf::detail::warp_size; @@ -548,51 +644,50 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type // probably need knobs for number of rows vs columns to balance read/write auto group = 
cooperative_groups::this_thread_block(); - int const blocks_remaining = - std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL, - static_cast(NUM_VALIDITY_BLOCKS_PER_KERNEL)); + int const tiles_remaining = + std::min(static_cast(tile_infos.size()) - blockIdx.x * NUM_VALIDITY_TILES_PER_KERNEL, + static_cast(NUM_VALIDITY_TILES_PER_KERNEL)); __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + shared_tile_barriers[NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); + for (int i = 0; i < NUM_VALIDITY_TILES_PER_KERNEL_LOADED; ++i) { + init(&shared_tile_barriers[i], group.size()); } } group.sync(); - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] - .arrive_and_wait(); + for (int validity_tile = 0; validity_tile < tiles_remaining; ++validity_tile) { + if (validity_tile >= NUM_VALIDITY_TILES_PER_KERNEL_LOADED) { + shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED].arrive_and_wait(); } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; + int8_t *this_shared_tile = shared_tiles[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; + auto tile = tile_infos[blockIdx.x * NUM_VALIDITY_TILES_PER_KERNEL + validity_tile]; - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); - auto const num_sections_x = util::div_rounding_up_unsafe(num_block_cols, 32); - auto const num_sections_y = util::div_rounding_up_unsafe(num_block_rows, 32); + auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, 32); + auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, 32); auto const validity_data_row_length = - util::round_up_unsafe(util::div_rounding_up_unsafe(num_block_cols, 8), JCUDF_ROW_ALIGNMENT); + util::round_up_unsafe(util::div_rounding_up_unsafe(num_tile_cols, 8), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + auto const warps_per_tile = std::max(1u, blockDim.x / warp_size); - // the block is divided into sections. A warp operates on a section at a time. + // the tile is divided into sections. A warp operates on a section at a time. 
for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { + my_section_idx += warps_per_tile) { // convert to rows and cols auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 32 + lane_id; auto const relative_row = section_y * 32; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; + auto const absolute_col = relative_col + tile.start_col; + auto const absolute_row = relative_row + tile.start_row; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_col < num_columns); if (absolute_col < num_columns) { @@ -610,141 +705,145 @@ __global__ void copy_validity_to_rows(const size_type num_rows, const size_type auto const validity_write_offset = validity_data_row_length * (relative_row + i) + relative_col / 8; if (threadIdx.x % warp_size == 0) { - *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; + *reinterpret_cast(&this_shared_tile[validity_write_offset]) = validity_data; } } } } - // make sure entire block has finished copy + // make sure entire tile has finished copy group.sync(); auto const output_data_base = - output_data[block.batch_number] + validity_offset + block.start_col / 8; + output_data[tile.batch_number] + validity_offset + tile.start_col / 8; // now async memcpy the shared memory out to the final destination 4 bytes at a time since we do // 32-row chunks - auto const row_bytes = util::div_rounding_up_unsafe(num_block_cols, 8); + auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, 8); auto const chunks_per_row = util::div_rounding_up_unsafe(row_bytes, 8); - auto const total_chunks = chunks_per_row * block.num_rows(); - auto &subset_barrier = - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + auto const total_chunks = chunks_per_row * tile.num_rows(); + auto &processing_barrier = + shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; auto const tail_bytes = row_bytes % 8; + auto const row_batch_start = + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { // determine source address of my chunk auto const relative_row = i / chunks_per_row; auto const col_chunk = i % chunks_per_row; auto const relative_chunk_offset = col_chunk * 8; - auto const output_dest = - output_data_base + row_offsets[relative_row + block.start_row] + relative_chunk_offset; + auto const output_dest = output_data_base + + row_offsets(relative_row + tile.start_row, row_batch_start) + + relative_chunk_offset; auto const input_src = - &this_shared_block[validity_data_row_length * relative_row + relative_chunk_offset]; + &this_shared_tile[validity_data_row_length * relative_row + relative_chunk_offset]; if (tail_bytes > 0 && col_chunk == chunks_per_row - 1) - cuda::memcpy_async(output_dest, input_src, tail_bytes, subset_barrier); + cuda::memcpy_async(output_dest, input_src, tail_bytes, processing_barrier); else - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); } } - // wait for last blocks of data to arrive - for (int validity_block = 0; - validity_block < blocks_remaining % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); + // wait for last tiles of data to arrive + for (int validity_tile = 0; + validity_tile < tiles_remaining % NUM_VALIDITY_TILES_PER_KERNEL_LOADED; ++validity_tile) { + shared_tile_barriers[validity_tile].arrive_and_wait(); } } /** * @brief copy data from row-based format to cudf columns * + * @tparam RowOffsetIter iterator that gives the size of a specific row of the table. * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block + * @param shmem_used_per_tile amount of shared memory that is used by a tile * @param row_offsets offset to a specific row in the input data + * @param batch_row_boundaries row numbers for batch starts * @param output_data pointers to column data * @param col_sizes array of sizes for each element in a column - one per column * @param col_offsets offset into input data row for each column's start - * @param block_infos information about the blocks of work + * @param tile_infos information about the tiles of work * @param input_data pointer to input data * */ +template __global__ void copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, const size_type *row_offsets, - int8_t **output_data, const size_type *col_sizes, - const size_type *col_offsets, - device_span block_infos, - const int8_t *input_data) { + const size_type shmem_used_per_tile, RowOffsetIter row_offsets, + size_type const *batch_row_boundaries, int8_t **output_data, + const size_type *col_sizes, const size_type *col_offsets, + device_span tile_infos, const int8_t *input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. // Because shared memory is limited we copy a subset of the rows at a time. - // This has been broken up for us in the block_info struct, so we don't have + // This has been broken up for us in the tile_info struct, so we don't have // any calculation to do here, but it is important to note. 
// to speed up some of the random access memory we do, we copy col_sizes and col_offsets - // to shared memory for each of the blocks that we work on + // to shared memory for each of the tiles that we work on - constexpr unsigned stages_count = NUM_BLOCKS_PER_KERNEL_LOADED; + constexpr unsigned stages_count = NUM_TILES_PER_KERNEL_LOADED; auto group = cooperative_groups::this_thread_block(); extern __shared__ int8_t shared_data[]; - int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_block}; + int8_t *shared[stages_count] = {shared_data, shared_data + shmem_used_per_tile}; - __shared__ cuda::barrier block_barrier[NUM_BLOCKS_PER_KERNEL_LOADED]; + __shared__ cuda::barrier tile_barrier[NUM_TILES_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&block_barrier[i], group.size()); + for (int i = 0; i < NUM_TILES_PER_KERNEL_LOADED; ++i) { + init(&tile_barrier[i], group.size()); } } group.sync(); - auto blocks_remaining = - std::min(static_cast(block_infos.size()) - blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS, - static_cast(NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); + auto tiles_remaining = + std::min(static_cast(tile_infos.size()) - blockIdx.x * NUM_TILES_PER_KERNEL_FROM_ROWS, + static_cast(NUM_TILES_PER_KERNEL_FROM_ROWS)); size_t fetch_index; size_t processing_index; - for (processing_index = fetch_index = 0; processing_index < blocks_remaining; - ++processing_index) { + for (processing_index = fetch_index = 0; processing_index < tiles_remaining; ++processing_index) { // Fetch ahead up to stages_count groups - for (; fetch_index < static_cast(blocks_remaining) && + for (; fetch_index < static_cast(tiles_remaining) && fetch_index < (processing_index + stages_count); ++fetch_index) { - auto const fetch_block = - block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + fetch_index]; - auto const fetch_block_start_row = fetch_block.start_row; - auto const starting_col_offset = col_offsets[fetch_block.start_col]; - auto const fetch_block_row_size = fetch_block.get_shared_row_size(col_offsets, col_sizes); - auto &fetch_barrier = block_barrier[fetch_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + auto const fetch_tile = tile_infos[blockIdx.x * NUM_TILES_PER_KERNEL_FROM_ROWS + fetch_index]; + auto const fetch_tile_start_row = fetch_tile.start_row; + auto const starting_col_offset = col_offsets[fetch_tile.start_col]; + auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); + auto &fetch_barrier = tile_barrier[fetch_index % NUM_TILES_PER_KERNEL_LOADED]; + auto const row_batch_start = + fetch_tile.batch_number == 0 ? 
0 : batch_row_boundaries[fetch_tile.batch_number]; // if we have fetched all buffers, we need to wait for processing // to complete on them before we can use them again - if (fetch_index > NUM_BLOCKS_PER_KERNEL_LOADED) { + if (fetch_index > NUM_TILES_PER_KERNEL_LOADED) { fetch_barrier.arrive_and_wait(); } - for (auto row = fetch_block_start_row + static_cast(threadIdx.x); - row <= fetch_block.end_row; row += blockDim.x) { - auto shared_offset = (row - fetch_block_start_row) * fetch_block_row_size; + for (auto row = fetch_tile_start_row + static_cast(threadIdx.x); + row <= fetch_tile.end_row; row += blockDim.x) { + auto shared_offset = (row - fetch_tile_start_row) * fetch_tile_row_size; // copy the data cuda::memcpy_async(&shared[fetch_index % stages_count][shared_offset], - &input_data[row_offsets[row] + starting_col_offset], - fetch_block_row_size, fetch_barrier); + &input_data[row_offsets(row, row_batch_start) + starting_col_offset], + fetch_tile_row_size, fetch_barrier); } } - auto &processing_barrier = block_barrier[processing_index % NUM_BLOCKS_PER_KERNEL_LOADED]; + auto &processing_barrier = tile_barrier[processing_index % NUM_TILES_PER_KERNEL_LOADED]; // ensure our data is ready processing_barrier.arrive_and_wait(); - auto const block = block_infos[blockIdx.x * NUM_BLOCKS_PER_KERNEL_FROM_ROWS + processing_index]; - auto const rows_in_block = block.num_rows(); - auto const cols_in_block = block.num_cols(); - auto const block_row_size = block.get_shared_row_size(col_offsets, col_sizes); + auto const tile = tile_infos[blockIdx.x * NUM_TILES_PER_KERNEL_FROM_ROWS + processing_index]; + auto const rows_in_tile = tile.num_rows(); + auto const cols_in_tile = tile.num_cols(); + auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); // now we copy from shared memory to final destination. // the data is laid out in rows in shared memory, so the reads @@ -753,15 +852,15 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // to prevent each thread working on a single row and also to ensure // that all threads can do work in the case of more threads than rows, // we do a global index instead of a double for loop with col/row. 
- for (int index = threadIdx.x; index < rows_in_block * cols_in_block; index += blockDim.x) { - auto const relative_col = index % cols_in_block; - auto const relative_row = index / cols_in_block; - auto const absolute_col = relative_col + block.start_col; - auto const absolute_row = relative_row + block.start_row; + for (int index = threadIdx.x; index < rows_in_tile * cols_in_tile; index += blockDim.x) { + auto const relative_col = index % cols_in_tile; + auto const relative_row = index / cols_in_tile; + auto const absolute_col = relative_col + tile.start_col; + auto const absolute_row = relative_row + tile.start_row; - auto const shared_memory_row_offset = block_row_size * relative_row; + auto const shared_memory_row_offset = tile_row_size * relative_row; auto const shared_memory_offset = - col_offsets[absolute_col] - col_offsets[block.start_col] + shared_memory_row_offset; + col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; auto const column_size = col_sizes[absolute_col]; int8_t *shmem_src = &shared[processing_index % stages_count][shared_memory_offset]; @@ -773,33 +872,36 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col } // wait on the last copies to complete - for (uint i = 0; i < std::min(stages_count, blocks_remaining); ++i) { - block_barrier[i].arrive_and_wait(); + for (uint i = 0; i < std::min(stages_count, tiles_remaining); ++i) { + tile_barrier[i].arrive_and_wait(); } } /** * @brief copy data from row-based format to cudf columns * + * @tparam RowOffsetIter iterator that gives the size of a specific row of the table. * @param num_rows total number of rows in the table * @param num_columns total number of columns in the table - * @param shmem_used_per_block amount of shared memory that is used by a block + * @param shmem_used_per_tile amount of shared memory that is used by a tile * @param row_offsets offset to a specific row in the input data + * @param batch_row_boundaries row numbers for batch starts * @param output_nm pointers to null masks for columns * @param validity_offsets offset into input data row for validity data - * @param block_infos information about the blocks of work + * @param tile_infos information about the tiles of work * @param input_data pointer to input data * */ -__global__ void copy_validity_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_block, - const size_type *row_offsets, bitmask_type **output_nm, - const size_type validity_offset, - device_span block_infos, - const int8_t *input_data) { +template +__global__ void +copy_validity_from_rows(const size_type num_rows, const size_type num_columns, + const size_type shmem_used_per_tile, RowOffsetIter row_offsets, + size_type const *batch_row_boundaries, bitmask_type **output_nm, + const size_type validity_offset, device_span tile_infos, + const int8_t *input_data) { extern __shared__ int8_t shared_data[]; - int8_t *shared_blocks[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED] = { - shared_data, shared_data + shmem_used_per_block / 2}; + int8_t *shared_tiles[NUM_VALIDITY_TILES_PER_KERNEL_LOADED] = { + shared_data, shared_data + shmem_used_per_tile / 2}; using cudf::detail::warp_size; @@ -809,55 +911,57 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ // probably need knobs for number of rows vs columns to balance read/write auto group = cooperative_groups::this_thread_block(); - int const blocks_remaining = - std::min(static_cast(block_infos.size()) - blockIdx.x 
* NUM_VALIDITY_BLOCKS_PER_KERNEL, - static_cast(NUM_VALIDITY_BLOCKS_PER_KERNEL)); + int const tiles_remaining = + std::min(static_cast(tile_infos.size()) - blockIdx.x * NUM_VALIDITY_TILES_PER_KERNEL, + static_cast(NUM_VALIDITY_TILES_PER_KERNEL)); __shared__ cuda::barrier - shared_block_barriers[NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + shared_tile_barriers[NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; if (group.thread_rank() == 0) { - for (int i = 0; i < NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; ++i) { - init(&shared_block_barriers[i], group.size()); + for (int i = 0; i < NUM_VALIDITY_TILES_PER_KERNEL_LOADED; ++i) { + init(&shared_tile_barriers[i], group.size()); } } group.sync(); - for (int validity_block = 0; validity_block < blocks_remaining; ++validity_block) { - if (validity_block >= NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED) { - auto const validity_index = validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED; - shared_block_barriers[validity_index].arrive_and_wait(); + for (int validity_tile = 0; validity_tile < tiles_remaining; ++validity_tile) { + if (validity_tile >= NUM_VALIDITY_TILES_PER_KERNEL_LOADED) { + auto const validity_index = validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED; + shared_tile_barriers[validity_index].arrive_and_wait(); } - int8_t *this_shared_block = shared_blocks[validity_block % 2]; - auto const block = block_infos[blockIdx.x * NUM_VALIDITY_BLOCKS_PER_KERNEL + validity_block]; - auto const block_start_col = block.start_col; - auto const block_start_row = block.start_row; - auto const num_block_cols = block.num_cols(); - auto const num_block_rows = block.num_rows(); - auto const num_sections_x = util::div_rounding_up_safe(num_block_cols, 8); - auto const num_sections_y = util::div_rounding_up_safe(num_block_rows, 32); + int8_t *this_shared_tile = shared_tiles[validity_tile % 2]; + auto const tile = tile_infos[blockIdx.x * NUM_VALIDITY_TILES_PER_KERNEL + validity_tile]; + auto const tile_start_col = tile.start_col; + auto const tile_start_row = tile.start_row; + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); + auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, 8); + auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, 32); auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; int const lane_id = threadIdx.x % warp_size; - auto const warps_per_block = std::max(1u, blockDim.x / warp_size); + auto const warps_per_tile = std::max(1u, blockDim.x / warp_size); - // the block is divided into sections. A warp operates on a section at a time. + // the tile is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp_id; my_section_idx < total_sections; - my_section_idx += warps_per_block) { + my_section_idx += warps_per_tile) { // convert section to row and col auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * 8; auto const relative_row = section_y * 32 + lane_id; - auto const absolute_col = relative_col + block_start_col; - auto const absolute_row = relative_row + block_start_row; + auto const absolute_col = relative_col + tile_start_col; + auto const absolute_row = relative_row + tile_start_row; + auto const row_batch_start = + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); if (absolute_row < num_rows) { - auto const my_byte = - input_data[row_offsets[absolute_row] + validity_offset + absolute_col / 8]; + auto const my_byte = input_data[row_offsets(absolute_row, row_batch_start) + + validity_offset + absolute_col / 8]; // so every thread that is participating in the warp has a byte, but it's row-based // data and we need it in column-based. So we shuffle the bits around to make @@ -870,47 +974,47 @@ __global__ void copy_validity_from_rows(const size_type num_rows, const size_typ auto const validity_write_offset = validity_data_col_length * (relative_col + i) + relative_row / 8; - *reinterpret_cast(&this_shared_block[validity_write_offset]) = validity_data; + *reinterpret_cast(&this_shared_tile[validity_write_offset]) = validity_data; } } } } - // make sure entire block has finished copy + // make sure entire tile has finished copy group.sync(); // now async memcpy the shared memory out to the final destination 8 bytes at a time - auto const col_bytes = util::div_rounding_up_unsafe(num_block_rows, 8); + auto const col_bytes = util::div_rounding_up_unsafe(num_tile_rows, 8); auto const chunks_per_col = util::div_rounding_up_unsafe(col_bytes, 8); - auto const total_chunks = chunks_per_col * num_block_cols; - auto &subset_barrier = - shared_block_barriers[validity_block % NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED]; + auto const total_chunks = chunks_per_col * num_tile_cols; + auto &processing_barrier = + shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; auto const tail_bytes = col_bytes % 8; for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { // determine source address of my chunk auto const relative_col = i / chunks_per_col; auto const row_chunk = i % chunks_per_col; - auto const absolute_col = relative_col + block_start_col; + auto const absolute_col = relative_col + tile_start_col; auto const relative_chunk_byte_offset = row_chunk * 8; - auto const output_dest = - output_nm[absolute_col] + word_index(block_start_row) + row_chunk * 2; + auto const output_dest = output_nm[absolute_col] + word_index(tile_start_row) + row_chunk * 2; auto const input_src = - &this_shared_block[validity_data_col_length * relative_col + relative_chunk_byte_offset]; + &this_shared_tile[validity_data_col_length * relative_col + relative_chunk_byte_offset]; - if (tail_bytes > 0 && row_chunk == chunks_per_col - 1) - cuda::memcpy_async(output_dest, input_src, tail_bytes, subset_barrier); - else - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), subset_barrier); + if (tail_bytes > 0 && row_chunk == chunks_per_col - 1) { + cuda::memcpy_async(output_dest, input_src, tail_bytes, processing_barrier); + } else { + cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); + } } } - // wait for last blocks of data to arrive - auto const num_blocks_to_wait = blocks_remaining > NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED ? - NUM_VALIDITY_BLOCKS_PER_KERNEL_LOADED : - blocks_remaining; - for (int validity_block = 0; validity_block < num_blocks_to_wait; ++validity_block) { - shared_block_barriers[validity_block].arrive_and_wait(); + // wait for last tiles of data to arrive + auto const num_tiles_to_wait = tiles_remaining > NUM_VALIDITY_TILES_PER_KERNEL_LOADED ? 
+ NUM_VALIDITY_TILES_PER_KERNEL_LOADED : + tiles_remaining; + for (int validity_tile = 0; validity_tile < num_tiles_to_wait; ++validity_tile) { + shared_tile_barriers[validity_tile].arrive_and_wait(); } } @@ -1087,19 +1191,19 @@ static size_type compute_column_information(iterator begin, iterator end, } /** - * @brief Build `block_info` for the validity data to break up the work. + * @brief Build `tile_info` for the validity data to break up the work. * * @param num_columns number of columns in the table * @param num_rows number of rows in the table - * @param shmem_limit_per_block size of shared memory available to a single gpu block + * @param shmem_limit_per_tile size of shared memory available to a single gpu tile * @param row_batches batched row information for multiple output locations - * @return vector of `block_info` structs for validity data + * @return vector of `tile_info` structs for validity data */ -std::vector -build_validity_block_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_block, - std::vector const &row_batches) { - auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_block)); +std::vector +build_validity_tile_infos(size_type const &num_columns, size_type const &num_rows, + size_type const &shmem_limit_per_tile, + std::vector const &row_batches) { + auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_tile)); auto const column_stride = util::round_up_unsafe( [&]() { if (desired_rows_and_columns > num_columns) { @@ -1116,28 +1220,29 @@ build_validity_block_infos(size_type const &num_columns, size_type const &num_ro // element still takes 8 bytes! auto const bytes_per_row = util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), JCUDF_ROW_ALIGNMENT); - auto const row_stride = std::min(num_rows, shmem_limit_per_block / bytes_per_row); + auto const row_stride = + std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); - std::vector validity_block_infos; + std::vector validity_tile_infos; for (int col = 0; col < num_columns; col += column_stride) { - int current_window_row_batch = 0; - int rows_left_in_batch = row_batches[current_window_row_batch].row_count; + int current_tile_row_batch = 0; + int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; int row = 0; while (row < num_rows) { if (rows_left_in_batch == 0) { - current_window_row_batch++; - rows_left_in_batch = row_batches[current_window_row_batch].row_count; + current_tile_row_batch++; + rows_left_in_batch = row_batches[current_tile_row_batch].row_count; } - int const window_height = std::min(row_stride, rows_left_in_batch); + int const tile_height = std::min(row_stride, rows_left_in_batch); - validity_block_infos.emplace_back(detail::block_info{ - col, row, std::min(col + column_stride - 1, num_columns - 1), row + window_height - 1}); - row += window_height; - rows_left_in_batch -= window_height; + validity_tile_infos.emplace_back(detail::tile_info{ + col, row, std::min(col + column_stride - 1, num_columns - 1), row + tile_height - 1}); + row += tile_height; + rows_left_in_batch -= tile_height; } } - return validity_block_infos; + return validity_tile_infos; } template struct row_size_functor { @@ -1155,6 +1260,7 @@ template struct row_size_functor { * @tparam RowSize iterator that gives the size of a specific row of the table. * @param num_rows Total number of rows in the table * @param row_sizes iterator that gives the size of a specific row of the table. 
+ * @param all_fixed_width bool indicating all data in this table is fixed width * @param stream stream to operate on for this work * @param mr memory resource used to allocate any returned data * @returns vector of size_type's that indicate row numbers for batch boundaries and a @@ -1162,8 +1268,8 @@ template struct row_size_functor { */ template -batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { auto uint64_row_sizes = cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); auto const total_size = @@ -1173,7 +1279,7 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream auto const num_offsets = num_batches + 1; std::vector row_batches; std::vector batch_row_boundaries; - device_uvector batch_row_offsets(num_rows, stream); + device_uvector batch_row_offsets(all_fixed_width ? 0 : num_rows, stream); // at most max gpu memory / 2GB iterations. batch_row_boundaries.reserve(num_offsets); @@ -1212,8 +1318,10 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream // The output_batch_row_offsets vector is used as the offset column of the returned data. This // needs to be individually allocated, but the kernel needs a contiguous array of offsets or // more global lookups are necessary. - cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), - num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + if (!all_fixed_width) { + cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + } batch_row_boundaries.push_back(row_end); row_batches.push_back({batch_bytes, num_rows_in_batch, std::move(output_batch_row_offsets)}); @@ -1221,141 +1329,142 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, rmm::cuda_stream last_row_end = row_end; } - return {std::move(batch_row_offsets), batch_row_boundaries, std::move(row_batches)}; + return {std::move(batch_row_offsets), make_device_uvector_async(batch_row_boundaries, stream), + std::move(batch_row_boundaries), std::move(row_batches)}; } /** - * @brief Computes the number of blocks necessary given a window height and batch offsets + * @brief Computes the number of tiles necessary given a tile height and batch offsets * * @param batch_row_boundaries row boundaries for each batch - * @param desired_window_height height of each window in the table + * @param desired_tile_height height of each tile in the table * @param stream stream to use - * @return number of windows necessary + * @return number of tiles necessary */ -int compute_block_counts(device_span const &batch_row_boundaries, - int desired_window_height, rmm::cuda_stream_view stream) { +int compute_tile_counts(device_span const &batch_row_boundaries, + int desired_tile_height, rmm::cuda_stream_view stream) { size_type const num_batches = batch_row_boundaries.size() - 1; - device_uvector num_blocks(num_batches, stream); + device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), + 
[desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], - desired_window_height); + desired_tile_height); }); - return thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + return thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); } /** - * @brief Builds the `block_info` structs for a given table. + * @brief Builds the `tile_info` structs for a given table. * - * @param blocks span of blocks to populate + * @param tiles span of tiles to populate * @param batch_row_boundaries boundary to row batches - * @param column_start starting column of the window - * @param column_end ending column of the window - * @param desired_window_height height of the window + * @param column_start starting column of the tile + * @param column_end ending column of the tile + * @param desired_tile_height height of the tile * @param total_number_of_rows total number of rows in the table * @param stream stream to use - * @return number of windows created + * @return number of tiles created */ size_type -build_blocks(device_span blocks, - device_uvector const &batch_row_boundaries, // comes from build_batches - int column_start, int column_end, int desired_window_height, int total_number_of_rows, - rmm::cuda_stream_view stream) { +build_tiles(device_span tiles, + device_uvector const &batch_row_boundaries, // comes from build_batches + int column_start, int column_end, int desired_tile_height, int total_number_of_rows, + rmm::cuda_stream_view stream) { size_type const num_batches = batch_row_boundaries.size() - 1; - device_uvector num_blocks(num_batches, stream); + device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_blocks.begin(), - [desired_window_height, + thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), + [desired_tile_height, batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], - desired_window_height); + desired_tile_height); }); - size_type const total_blocks = - thrust::reduce(rmm::exec_policy(stream), num_blocks.begin(), num_blocks.end()); + size_type const total_tiles = + thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); - device_uvector block_starts(num_batches + 1, stream); - auto block_iter = cudf::detail::make_counting_transform_iterator( - 0, [num_blocks = num_blocks.data(), num_batches] __device__(auto i) { - return (i < num_batches) ? num_blocks[i] : 0; + device_uvector tile_starts(num_batches + 1, stream); + auto tile_iter = cudf::detail::make_counting_transform_iterator( + 0, [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? 
num_tiles[i] : 0; }); - thrust::exclusive_scan(rmm::exec_policy(stream), block_iter, block_iter + num_batches + 1, - block_starts.begin()); // in blocks + thrust::exclusive_scan(rmm::exec_policy(stream), tile_iter, tile_iter + num_batches + 1, + tile_starts.begin()); // in tiles thrust::transform( - rmm::exec_policy(stream), iter, iter + total_blocks, blocks.begin(), - [=, block_starts = block_starts.data(), - batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type block_index) { - // what batch this block falls in + rmm::exec_policy(stream), iter, iter + total_tiles, tiles.begin(), + [=, tile_starts = tile_starts.data(), + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { + // what batch this tile falls in auto const batch_index_iter = - thrust::upper_bound(thrust::seq, block_starts, block_starts + num_batches, block_index); - auto const batch_index = std::distance(block_starts, batch_index_iter) - 1; - // local index within the block - int const local_block_index = block_index - block_starts[batch_index]; + thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); + auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; + // local index within the tile + int const local_tile_index = tile_index - tile_starts[batch_index]; // the start row for this batch. int const batch_row_start = batch_row_boundaries[batch_index]; - // the start row for this block - int const block_row_start = batch_row_start + (local_block_index * desired_window_height); - // the end row for this block + // the start row for this tile + int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); + // the end row for this tile int const max_row = std::min(total_number_of_rows - 1, batch_index + 1 > num_batches ? std::numeric_limits::max() : static_cast(batch_row_boundaries[batch_index + 1]) - 1); - int const block_row_end = std::min( - batch_row_start + ((local_block_index + 1) * desired_window_height) - 1, max_row); + int const tile_row_end = + std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); - // stuff the block - return block_info{column_start, block_row_start, column_end, block_row_end, - static_cast(batch_index)}; + // stuff the tile + return tile_info{column_start, tile_row_start, column_end, tile_row_end, + static_cast(batch_index)}; }); - return total_blocks; + return total_tiles; } /** - * @brief Determines what data should be operated on by each block for the incoming table. + * @brief Determines what data should be operated on by each tile for the incoming table. 
* - * @tparam WindowCallback Callback that receives the start and end columns of windows + * @tparam TileCallback Callback that receives the start and end columns of tiles * @param column_sizes vector of the size of each column * @param column_starts vector of the offset of each column - * @param first_row_batch_size size of the first row batch to limit max window size since a window + * @param first_row_batch_size size of the first row batch to limit max tile size since a tile * is unable to span batches * @param total_number_of_rows total number of rows in the table - * @param shmem_limit_per_block shared memory allowed per block - * @param f callback function called when building a window + * @param shmem_limit_per_tile shared memory allowed per tile + * @param f callback function called when building a tile */ -template -void determine_windows(std::vector const &column_sizes, - std::vector const &column_starts, - size_type const first_row_batch_size, size_type const total_number_of_rows, - size_type const &shmem_limit_per_block, WindowCallback f) { - // block infos are organized with the windows going "down" the columns +template +void determine_tiles(std::vector const &column_sizes, + std::vector const &column_starts, + size_type const first_row_batch_size, size_type const total_number_of_rows, + size_type const &shmem_limit_per_tile, TileCallback f) { + // tile infos are organized with the tile going "down" the columns // this provides the most coalescing of memory access - int current_window_width = 0; - int current_window_start_col = 0; + int current_tile_width = 0; + int current_tile_start_col = 0; - // the ideal window height has lots of 8-byte reads and 8-byte writes. The optimal read/write - // would be memory cache line sized access, but since other blocks will read/write the edges + // the ideal tile height has lots of 8-byte reads and 8-byte writes. The optimal read/write + // would be memory cache line sized access, but since other tiles will read/write the edges // this may not turn out to be overly important. For now, we will attempt to build a square - // window as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = + // tile as far as byte sizes. x * y = shared_mem_size. Which translates to x^2 = // shared_mem_size since we want them equal, so height and width are sqrt(shared_mem_size). The // trick is that it's in bytes, not rows or columns. - auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_block)); - auto const window_height = + auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); + auto const tile_height = std::clamp(util::round_up_safe( std::min(optimal_square_len / column_sizes[0], total_number_of_rows), 32), 1, first_row_batch_size); int row_size = 0; - // march each column and build the blocks of appropriate sizes + // march each column and build the tiles of appropriate sizes for (uint col = 0; col < column_sizes.size(); ++col) { auto const col_size = column_sizes[col]; @@ -1366,25 +1475,25 @@ void determine_windows(std::vector const &column_sizes, auto const row_size_with_end_pad = util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); - if (row_size_with_end_pad * window_height > shmem_limit_per_block) { - // too large, close this window, generate vertical blocks and restart - f(current_window_start_col, col == 0 ? 
col : col - 1, window_height); + if (row_size_with_end_pad * tile_height > shmem_limit_per_tile) { + // too large, close this tile, generate vertical tiles and restart + f(current_tile_start_col, col == 0 ? col : col - 1, tile_height); row_size = util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory window boundary to match + row_size += col_size; // alignment required for shared memory tile boundary to match // alignment of output row - current_window_start_col = col; - current_window_width = 0; + current_tile_start_col = col; + current_tile_width = 0; } else { row_size = row_size_with_this_col; - current_window_width++; + current_tile_width++; } } - // build last set of blocks - if (current_window_width > 0) { - f(current_window_start_col, static_cast(column_sizes.size()) - 1, window_height); + // build last set of tiles + if (current_tile_width > 0) { + f(current_tile_start_col, static_cast(column_sizes.size()) - 1, tile_height); } } @@ -1399,18 +1508,23 @@ std::vector> convert_to_rows(table_view const &tbl, auto const num_columns = tbl.num_columns(); auto const num_rows = tbl.num_rows(); + auto const fixed_width_only = std::all_of( + tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - - // TODO: why is this needed. kernel fails to launch if all memory is requested. - total_shmem -= 1024; - auto const shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; - - // break up the work into blocks, which are a starting and ending row/col #. - // this window size is calculated based on the shared memory size available - // we want a single block to fill up the entire shared memory space available + int total_shmem_in_bytes; + CUDA_TRY( + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + + // Need to reduce total shmem available by the size of barriers in the kernel's shared memory + total_shmem_in_bytes -= + sizeof(cuda::barrier) * NUM_TILES_PER_KERNEL_LOADED; + auto const shmem_limit_per_tile = total_shmem_in_bytes / NUM_TILES_PER_KERNEL_LOADED; + + // break up the work into tiles, which are a starting and ending row/col #. + // this tile size is calculated based on the shared memory size available + // we want a single tile to fill up the entire shared memory space available // for the transpose-like conversion. // There are two different processes going on here. The GPU conversion of the data @@ -1419,19 +1533,19 @@ std::vector> convert_to_rows(table_view const &tbl, // this limitation because the column must own the data inside and as a result it must be // a distinct allocation for that column. Copying the data into these final buffers would // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The windows are broken at the boundaries of specific rows based on the row sizes up + // The tiles are broken at the boundaries of specific rows based on the row sizes up // to that point. These are row batches and they are decided first before building the - // windows so the windows can be properly cut around them. + // tiles so the tiles can be properly cut around them. 
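As a rough illustration of the square-tile sizing described above, the following self-contained sketch (hypothetical names, a single uniform column width, no JCUDF_ROW_ALIGNMENT padding) shows how a tile height and width can be derived from the shared-memory budget; the patch itself walks the real per-column sizes and pads each row.

    #include <algorithm>
    #include <cmath>

    struct tile_extent { int rows; int cols; };

    // Sketch only: aim for a tile that is roughly square in *bytes*, so the side
    // length is sqrt(shared memory budget). Height is rounded up to a multiple of
    // 32 rows and clamped to the first row batch, since a tile may not span batches.
    tile_extent pick_square_tile(int shmem_limit_bytes, int bytes_per_col, int num_rows,
                                 int num_cols, int first_batch_rows) {
      int const side   = static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_bytes)));
      int const height = std::clamp(
          (std::min(side / std::max(bytes_per_col, 1), num_rows) + 31) / 32 * 32,
          1, first_batch_rows);
      // add columns until one more column would overflow the shared-memory budget
      int width = 0, row_bytes = 0;
      while (width < num_cols && (row_bytes + bytes_per_col) * height <= shmem_limit_bytes) {
        row_bytes += bytes_per_col;
        ++width;
      }
      return {height, std::max(width, 1)};
    }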
// Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - input_data.reserve(num_columns); - input_nm.reserve(num_columns); - std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_data), - [](column_view const &c) -> int8_t const * { return c.template data(); }); - std::transform(tbl.begin(), tbl.end(), std::back_inserter(input_nm), - [](auto c) { return c.null_mask(); }); + + auto data_begin = thrust::make_transform_iterator( + tbl.begin(), [](auto const &c) { return c.template data(); }); + std::vector input_data(data_begin, data_begin + tbl.num_columns()); + + auto nm_begin = + thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { return c.null_mask(); }); + std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); @@ -1444,7 +1558,7 @@ std::vector> convert_to_rows(table_view const &tbl, auto schema_column_iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&tbl](auto i) -> std::tuple { - return std::make_tuple(tbl.column(i).type(), tbl.column(i)); + return {tbl.column(i).type(), tbl.column(i)}; }); auto const fixed_width_size_per_row = detail::compute_column_information( @@ -1461,9 +1575,7 @@ std::vector> convert_to_rows(table_view const &tbl, return util::round_up_unsafe(bytes_needed, JCUDF_ROW_ALIGNMENT); }); - auto batch_info = detail::build_batches(num_rows, row_size_iter, stream, mr); - auto gpu_batch_row_boundaries = - make_device_uvector_async(batch_info.batch_row_boundaries, stream); + auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; @@ -1479,52 +1591,67 @@ std::vector> convert_to_rows(table_view const &tbl, std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto &buf) { return static_cast(buf.data()); }); + /* auto output_data_begin = thrust::make_transform_iterator(batch_info.row_batches.begin(), + [stream, mr](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); + std::vector output_buffers( output_data_begin, output_data_begin + + batch_info.row_batches.size() ); + + auto output_buffers_begin = thrust::make_transform_iterator(output_buffers.begin(), + [](auto const &buf) -> int8_t * { return static_cast(buf.data()); }); + std::vector output_data( output_buffers_begin, output_buffers_begin + + output_buffers.size() );*/ + auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; - detail::determine_windows( - column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, - [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, - int const window_height) { - int i = detail::compute_block_counts(gpu_batch_row_boundaries, window_height, stream); + detail::determine_tiles( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, + &stream](int const start_col, int const end_col, int const tile_height) { + int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); info_count += i; }); - // allocate space for blocks - device_uvector gpu_block_infos(info_count, stream); - int 
block_offset = 0; - - detail::determine_windows( - column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_block, - [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, - stream](int const start_col, int const end_col, int const window_height) { - block_offset += detail::build_blocks( - {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, - gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + // allocate space for tiles + device_uvector gpu_tile_infos(info_count, stream); + int tile_offset = 0; + + detail::determine_tiles( + column_sizes, column_starts, first_batch_size, num_rows, shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &gpu_tile_infos, num_rows, + &tile_offset, stream](int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); }); // blast through the entire table and convert it - dim3 blocks(util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_TO_ROWS)); + dim3 blocks(util::div_rounding_up_unsafe(gpu_tile_infos.size(), NUM_TILES_PER_KERNEL_TO_ROWS)); dim3 threads(256); - auto validity_block_infos = detail::build_validity_block_infos( - num_columns, num_rows, shmem_limit_per_block, batch_info.row_batches); + auto validity_tile_infos = detail::build_validity_tile_infos( + num_columns, num_rows, shmem_limit_per_tile, batch_info.row_batches); - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); + auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); - - detail::copy_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, gpu_block_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), - batch_info.batch_row_offsets - .data(), // needs to be row offsets per batch, not overall JUST for output. + util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); + dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); + + auto const fixed_width_only_row_size = util::round_up_unsafe( + fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8), 8); + detail::row_offset_functor offset_functor(fixed_width_only_row_size); + + detail::copy_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), + dev_col_sizes.data(), dev_col_starts.data(), offset_functor, + batch_info.d_batch_row_boundaries.data(), reinterpret_cast(dev_output_data.data())); - detail::copy_validity_to_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, batch_info.batch_row_offsets.data(), - dev_output_data.data(), column_starts.back(), dev_validity_block_infos, dev_input_nm.data()); + detail::copy_validity_to_rows<<>>( + num_rows, num_columns, shmem_limit_per_tile, offset_functor, + batch_info.d_batch_row_boundaries.data(), dev_output_data.data(), column_starts.back(), + dev_validity_tile_infos, dev_input_nm.data()); // split up the output buffer into multiple buffers based on row batch sizes // and create list of byte columns @@ -1629,12 +1756,14 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, int device_id; CUDA_TRY(cudaGetDevice(&device_id)); - int total_shmem; - CUDA_TRY(cudaDeviceGetAttribute(&total_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + int total_shmem_in_bytes; + CUDA_TRY( + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); - // TODO: why is this needed. kernel fails to launch if all memory is requested. - total_shmem -= 1024; - int shmem_limit_per_block = total_shmem / NUM_BLOCKS_PER_KERNEL_LOADED; + // Need to reduce total shmem available by the size of barriers in the kernel's shared memory + total_shmem_in_bytes -= + sizeof(cuda::barrier) * NUM_TILES_PER_KERNEL_LOADED; + int shmem_limit_per_tile = total_shmem_in_bytes / NUM_TILES_PER_KERNEL_LOADED; std::vector column_starts; std::vector column_sizes; @@ -1686,50 +1815,53 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; }); int info_count = 0; - detail::determine_windows(column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, - [&gpu_batch_row_boundaries, &info_count, &stream]( - int const start_col, int const end_col, int const window_height) { - info_count += detail::compute_block_counts(gpu_batch_row_boundaries, - window_height, stream); - }); - - // allocate space for blocks - device_uvector gpu_block_infos(info_count, stream); - - int block_offset = 0; - detail::determine_windows( - column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_block, - [&gpu_batch_row_boundaries, &gpu_block_infos, num_rows, &block_offset, - stream](int const start_col, int const end_col, int const window_height) { - block_offset += detail::build_blocks( - {gpu_block_infos.data() + block_offset, gpu_block_infos.size() - block_offset}, - gpu_batch_row_boundaries, start_col, end_col, window_height, num_rows, stream); + detail::determine_tiles(column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &info_count, + &stream](int const start_col, int const end_col, int const tile_height) { + info_count += detail::compute_tile_counts(gpu_batch_row_boundaries, + tile_height, stream); + }); + + // allocate space for tiles + device_uvector gpu_tile_infos(info_count, stream); + + int tile_offset = 0; + detail::determine_tiles( + column_sizes, column_starts, num_rows, num_rows, shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, + stream](int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); }); - dim3 blocks( - util::div_rounding_up_unsafe(gpu_block_infos.size(), NUM_BLOCKS_PER_KERNEL_FROM_ROWS)); - dim3 threads(std::min(std::min(256, shmem_limit_per_block / 8), static_cast(child.size()))); + dim3 blocks(util::div_rounding_up_unsafe(gpu_tile_infos.size(), NUM_TILES_PER_KERNEL_FROM_ROWS)); + dim3 threads(std::min(std::min(256, shmem_limit_per_tile / 8), static_cast(child.size()))); - auto validity_block_infos = - detail::build_validity_block_infos(num_columns, num_rows, shmem_limit_per_block, row_batches); + auto validity_tile_infos = + detail::build_validity_tile_infos(num_columns, num_rows, shmem_limit_per_tile, row_batches); - auto dev_validity_block_infos = make_device_uvector_async(validity_block_infos, stream); + auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream); dim3 validity_blocks( - util::div_rounding_up_unsafe(validity_block_infos.size(), NUM_VALIDITY_BLOCKS_PER_KERNEL)); + util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); + + dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - dim3 validity_threads(std::min(validity_block_infos.size() * 32, 128lu)); + auto const fixed_width_only_row_size = util::round_up_unsafe( + fixed_width_size_per_row + util::div_rounding_up_safe(static_cast(num_columns), 8), + 8); + detail::row_offset_functor offset_functor(fixed_width_only_row_size); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_block_infos, + detail::copy_from_rows<<>>( + num_rows, 
num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), + dev_output_data.data(), dev_col_sizes.data(), dev_col_starts.data(), gpu_tile_infos, child.data()); - detail:: - copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_block, input.offsets().data(), - dev_output_nm.data(), column_starts.back(), dev_validity_block_infos, - child.data()); + detail::copy_validity_from_rows<<>>( + num_rows, num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), + dev_output_nm.data(), column_starts.back(), dev_validity_tile_infos, child.data()); return std::make_unique
(std::move(output_columns)); #else @@ -1794,6 +1926,6 @@ std::unique_ptr
(std::move(output_columns));
 #else
@@ -1794,6 +1926,6 @@ std::unique_ptr<table>
convert_from_rows_fixed_width_optimized( } } -} // namespace java +} // namespace jni } // namespace cudf From 7fbe10dbb55873fd0ac03706dd283f7c5ca90229 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 23 Dec 2021 02:14:39 +0000 Subject: [PATCH 73/80] removing commented out code --- java/src/main/native/src/row_conversion.cu | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index c1b6bdbce5d..9df8d7b7f14 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -1591,16 +1591,6 @@ std::vector> convert_to_rows(table_view const &tbl, std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto &buf) { return static_cast(buf.data()); }); - /* auto output_data_begin = thrust::make_transform_iterator(batch_info.row_batches.begin(), - [stream, mr](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); - std::vector output_buffers( output_data_begin, output_data_begin + - batch_info.row_batches.size() ); - - auto output_buffers_begin = thrust::make_transform_iterator(output_buffers.begin(), - [](auto const &buf) -> int8_t * { return static_cast(buf.data()); }); - std::vector output_data( output_buffers_begin, output_buffers_begin + - output_buffers.size() );*/ - auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; From d47360d94e9054df002c94432f630a4f90c0d084 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 4 Jan 2022 16:48:03 +0000 Subject: [PATCH 74/80] updating from review comments --- .../cudf/detail/utilities/integer_utils.hpp | 61 +++++++++++++------ java/src/main/java/ai/rapids/cudf/Table.java | 9 --- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 0e427c0418a..fe501279fd5 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,12 +33,18 @@ namespace cudf { //! Utility functions namespace util { /** - * Finds the smallest integer not less than `number_to_round` and modulo `S` is - * zero. This function assumes that `number_to_round` is non-negative and - * `modulus` is positive. + * @brief Rounds `number_to_round` up to the next multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return smallest integer greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive. The safety is in regard to rollover. */ template -inline S round_up_safe(S number_to_round, S modulus) +S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -50,20 +56,37 @@ inline S round_up_safe(S number_to_round, S modulus) } /** - * Finds the largest integer not greater than `number_to_round` and modulo `S` is - * zero. 
This function assumes that `number_to_round` is non-negative and - * `modulus` is positive. + * @brief Rounds `number_to_round` down to the last multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return largest integer not greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive and does not check for overflow. */ template -inline S round_down_safe(S number_to_round, S modulus) +S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; return rounded_down; } +/** + * @brief Rounds `number_to_round` up to the next multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return smallest integer greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive and does not check for overflow. + */ template -constexpr inline S round_up_unsafe(S number_to_round, S modulus) noexcept +constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -84,16 +107,16 @@ constexpr inline S round_up_unsafe(S number_to_round, S modulus) noexcept * the result will be incorrect */ template -constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +constexpr S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept { return (dividend + divisor - 1) / divisor; } namespace detail { template -constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +constexpr I div_rounding_up_safe(std::integral_constant, + I dividend, + I divisor) noexcept { // TODO: This could probably be implemented faster return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) @@ -101,9 +124,7 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, } template -constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +constexpr I div_rounding_up_safe(std::integral_constant, I dividend, I divisor) noexcept { auto quotient = dividend / divisor; auto remainder = dividend % divisor; @@ -125,14 +146,14 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, * approach of using (dividend + divisor - 1) / divisor */ template -constexpr inline I div_rounding_up_safe(I dividend, I divisor) noexcept +constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept { using i_is_a_signed_type = std::integral_constant::value>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } template -constexpr inline bool is_a_power_of_two(I val) noexcept +constexpr bool is_a_power_of_two(I val) noexcept { static_assert(std::is_integral::value, "This function only applies to integral types"); return ((val - 1) & val) == 0; @@ -162,7 +183,7 @@ constexpr inline bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. 
*/ template -constexpr inline auto absolute_value(T value) -> T +constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 06383c82ae6..6c34fd6f997 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -2789,15 +2789,6 @@ public ColumnVector[] convertToRowsFixedWidthOptimized() { return ret; } - public ColumnVector[] convertToRowsFixedWidthOptimized() { - long[] ptrs = convertToRowsFixedWidthOptimized(nativeHandle); - ColumnVector[] ret = new ColumnVector[ptrs.length]; - for (int i = 0; i < ptrs.length; i++) { - ret[i] = new ColumnVector(ptrs[i]); - } - return ret; - } - /** * Convert a column of list of bytes that is formatted like the output from `convertToRows` * and convert it back to a table. From 9b502718ad7d4fa814d5a78854715d04c3983e61 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 4 Jan 2022 18:45:20 +0000 Subject: [PATCH 75/80] Updating namespace --- java/src/main/native/src/TableJni.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 44c34d133ce..a3a00730f30 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2754,7 +2754,7 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); std::vector> cols = - cudf::java::convert_to_rows_fixed_width_optimized(*n_input_table); + cudf::jni::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2812,7 +2812,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); cudf::table_view *n_input_table = reinterpret_cast(input_table); - std::vector> cols = cudf::java::convert_to_rows(*n_input_table); + std::vector> cols = cudf::jni::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); for (int i = 0; i < num_columns; i++) { @@ -2839,7 +2839,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } std::unique_ptr result = - cudf::java::convert_from_rows_fixed_width_optimized(list_input, types_vec); + cudf::jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); @@ -2862,7 +2862,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e for (int i = 0; i < n_types.size(); i++) { types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); } - std::unique_ptr result = cudf::java::convert_from_rows(list_input, types_vec); + std::unique_ptr result = cudf::jni::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); From fb7566cfef456c43f512c0ecb0731aed15b8e10b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 4 Jan 2022 18:55:25 +0000 Subject: [PATCH 76/80] updating namespace --- java/src/main/native/src/row_conversion.hpp | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index edc2768d4bb..181a9fa068d 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -23,7 +23,7 @@ #include namespace cudf { -namespace java { +namespace jni { std::vector> convert_to_rows_fixed_width_optimized( cudf::table_view const &tbl, From 5e1cf972552d6b53041439f1bf185b5a8f3aa403 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 5 Jan 2022 21:27:43 -0500 Subject: [PATCH 77/80] Update java/src/main/native/src/row_conversion.cu Co-authored-by: MithunR --- java/src/main/native/src/row_conversion.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 9df8d7b7f14..9b60fb667b6 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -106,7 +106,7 @@ namespace detail { * NUM_TILES_PER_KERNEL_LOADED tiles at one time. The block will load * as many tiles as it can fit into shared memory and then wait on the * first tile to completely load before processing. Processing in this - * case means coping the data from shared memory back out to device + * case means copying the data from shared memory back out to device * memory via memcpy_async. This kernel is completely memory bound. * * Batch Data: From a1e35459e77ed1429044aea662961d2ae42c7c34 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 7 Jan 2022 05:14:28 +0000 Subject: [PATCH 78/80] moving to a constant iterator and other review cleanup --- java/src/main/native/src/row_conversion.cu | 65 +++++++++------------ java/src/main/native/src/row_conversion.hpp | 4 +- 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 9df8d7b7f14..2d701497942 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
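For a quick sanity check of the rounding helpers whose documentation is reworked in the integer_utils.hpp hunks above, here is a small standalone sketch; these are plain restatements of the documented behaviour for illustration, not the cudf::util templates themselves.

    #include <cassert>

    // Illustration only: the semantics the updated doc comments describe.
    int round_up_sketch(int v, int modulus)   { int r = v % modulus; return r == 0 ? v : v + modulus - r; }
    int round_down_sketch(int v, int modulus) { return v - v % modulus; }
    int div_rounding_up_sketch(int dividend, int divisor) { return (dividend + divisor - 1) / divisor; }

    int main() {
      assert(round_up_sketch(10, 8) == 16);       // smallest multiple of 8 not less than 10
      assert(round_down_sketch(10, 8) == 8);      // largest multiple of 8 not greater than 10
      assert(div_rounding_up_sketch(9, 8) == 2);  // ceil(9 / 8)
      return 0;
    }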
@@ -195,7 +195,7 @@ struct row_offset_functor { : _fixed_width_only_row_size(fixed_width_only_row_size){}; CUDA_DEVICE_CALLABLE - size_type operator()(int row_number, int tile_row_start) { + size_type operator()(int row_number, int tile_row_start) const { return (row_number - tile_row_start) * _fixed_width_only_row_size; } @@ -1187,7 +1187,10 @@ static size_type compute_column_information(iterator begin, iterator end, auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); - return fixed_width_size_per_row; + return util::round_up_unsafe( + fixed_width_size_per_row + + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), 8), + JCUDF_ROW_ALIGNMENT); } /** @@ -1224,6 +1227,7 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); std::vector validity_tile_infos; + validity_tile_infos.reserve(num_columns / column_stride * num_rows / row_stride); for (int col = 0; col < num_columns; col += column_stride) { int current_tile_row_batch = 0; int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; @@ -1245,13 +1249,21 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row return validity_tile_infos; } +/** + * @brief functor that returns the size of a row or 0 is row is greater than the number of rows in the table + * + * @tparam RowSize iterator that returns the size of a specific row + */ template struct row_size_functor { - RowSize _row_sizes; - size_type _num_rows; - row_size_functor(RowSize row_sizes) : _row_sizes(row_sizes){}; + row_size_functor(size_type row_end, RowSize row_sizes, size_type last_row_end) + : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) {} CUDA_DEVICE_CALLABLE - uint64_t operator()(int row_index) { return static_cast(_row_sizes[row_index]); } + uint64_t operator()(int i) const { return i >= _row_end ? 
0 : _row_sizes[i + _last_row_end]; } + + size_type _row_end; + RowSize _row_sizes; + size_type _last_row_end; }; /** @@ -1266,14 +1278,10 @@ template struct row_size_functor { * @returns vector of size_type's that indicate row numbers for batch boundaries and a * device_uvector of row offsets */ - template batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - auto uint64_row_sizes = - cudf::detail::make_counting_transform_iterator(0, row_size_functor(row_sizes)); - auto const total_size = - thrust::reduce(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows); + auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); auto const num_batches = static_cast( util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); auto const num_offsets = num_batches + 1; @@ -1286,7 +1294,7 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w batch_row_boundaries.push_back(0); size_type last_row_end = 0; device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), uint64_row_sizes, uint64_row_sizes + num_rows, + thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin()); while (static_cast(batch_row_boundaries.size()) < num_offsets) { @@ -1305,10 +1313,8 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w auto const num_entries = row_end - last_row_end + 1; device_uvector output_batch_row_offsets(num_entries, stream, mr); - auto row_size_iter_bounded = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [row_end, row_sizes, last_row_end] __device__(auto i) { - return i >= row_end ? 0 : row_sizes[i + last_row_end]; - }); + auto row_size_iter_bounded = cudf::detail::make_counting_transform_iterator( + 0, row_size_functor(row_end, row_sizes, last_row_end)); thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); @@ -1568,13 +1574,7 @@ std::vector> convert_to_rows(table_view const &tbl, auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); // total encoded row size. This includes fixed-width data, validity, and variable-width data. 
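To make the fixed-width row-size arithmetic concrete, a minimal host-side sketch follows (a hypothetical helper, not the patch's compute_column_information, which also records per-column start offsets and handles nested types): each column is aligned to its own size, one validity bit per column is packed into bytes at the end, and the whole row is padded to the 8-byte JCUDF alignment.

    #include <cstdint>
    #include <vector>

    // Sketch only: fixed-width JCUDF row size = per-column-aligned data + validity
    // bytes, padded so consecutive rows stay 8-byte aligned.
    int32_t sketch_fixed_width_row_size(std::vector<int32_t> const &col_sizes) {
      int32_t constexpr row_alignment = 8;  // JCUDF_ROW_ALIGNMENT in the real code
      int32_t offset = 0;
      for (auto size : col_sizes) {
        offset = (offset + size - 1) / size * size;  // align column to its own size
        offset += size;
      }
      offset += (static_cast<int32_t>(col_sizes.size()) + 7) / 8;  // validity bits -> bytes
      return (offset + row_alignment - 1) / row_alignment * row_alignment;
    }
    // e.g. {1, 4, 2, 8}: data occupies 24 bytes after per-column alignment,
    // one validity byte brings it to 25, padded to 32 bytes per row.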
- auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, [fixed_width_size_per_row, num_columns] __device__(auto i) { - auto const bytes_needed = - fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8); - return util::round_up_unsafe(bytes_needed, JCUDF_ROW_ALIGNMENT); - }); - + auto row_size_iter = thrust::make_constant_iterator(fixed_width_size_per_row); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); // the first batch always exists unless we were sent an empty table @@ -1627,9 +1627,7 @@ std::vector> convert_to_rows(table_view const &tbl, util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - auto const fixed_width_only_row_size = util::round_up_unsafe( - fixed_width_size_per_row + util::div_rounding_up_safe(num_columns, 8), 8); - detail::row_offset_functor offset_functor(fixed_width_only_row_size); + detail::row_offset_functor offset_functor(fixed_width_size_per_row); detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), @@ -1764,14 +1762,10 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto const fixed_width_size_per_row = detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); - auto const validity_size = num_bitmask_words(num_columns) * 4; - - auto const row_size = - util::round_up_unsafe(fixed_width_size_per_row + validity_size, JCUDF_ROW_ALIGNMENT); - // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine - CUDF_EXPECTS(row_size * num_rows == child.size(), "The layout of the data appears to be off"); + CUDF_EXPECTS(fixed_width_size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); @@ -1838,10 +1832,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - auto const fixed_width_only_row_size = util::round_up_unsafe( - fixed_width_size_per_row + util::div_rounding_up_safe(static_cast(num_columns), 8), - 8); - detail::row_offset_functor offset_functor(fixed_width_only_row_size); + detail::row_offset_functor offset_functor(fixed_width_size_per_row); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index 181a9fa068d..1a3cf37caba 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,5 +47,5 @@ convert_from_rows(cudf::lists_column_view const &input, std::vector Date: Mon, 10 Jan 2022 04:24:33 +0000 Subject: [PATCH 79/80] Removing magic numbers per review comments --- java/src/main/native/src/row_conversion.cu | 74 +++++++++++++--------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index e777099acb3..94b9e4bc143 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -587,20 +587,23 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; // copy entire row 8 bytes at a time - auto const chunks_per_row = util::div_rounding_up_unsafe(tile_row_size, 8); + constexpr auto bytes_per_chunk = 8; + auto const chunks_per_row = util::div_rounding_up_unsafe(tile_row_size, bytes_per_chunk); auto const total_chunks = chunks_per_row * tile.num_rows(); for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) { // determine source address of my chunk auto const relative_row = i / chunks_per_row; - auto const relative_chunk_offset = (i % chunks_per_row) * 8; + auto const relative_chunk_offset = (i % chunks_per_row) * bytes_per_chunk; auto const output_dest = tile_output_buffer + row_offsets(relative_row + tile.start_row, row_batch_start) + column_offset + relative_chunk_offset; auto const input_src = &shared[processing_index % stages_count] [tile_row_size * relative_row + relative_chunk_offset]; - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); + cuda::memcpy_async(output_dest, input_src, + cuda::aligned_size_t(bytes_per_chunk), + processing_barrier); } } @@ -670,8 +673,8 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, 32); auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, 32); - auto const validity_data_row_length = - util::round_up_unsafe(util::div_rounding_up_unsafe(num_tile_cols, 8), JCUDF_ROW_ALIGNMENT); + auto const validity_data_row_length = util::round_up_unsafe( + util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -703,7 +706,7 @@ copy_validity_to_rows(const size_type num_rows, const size_type 
num_columns, auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + relative_col / 8; + validity_data_row_length * (relative_row + i) + relative_col / CHAR_BIT; if (threadIdx.x % warp_size == 0) { *reinterpret_cast(&this_shared_tile[validity_write_offset]) = validity_data; } @@ -715,16 +718,17 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, group.sync(); auto const output_data_base = - output_data[tile.batch_number] + validity_offset + tile.start_col / 8; + output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; // now async memcpy the shared memory out to the final destination 4 bytes at a time since we do // 32-row chunks - auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, 8); - auto const chunks_per_row = util::div_rounding_up_unsafe(row_bytes, 8); + constexpr auto bytes_per_chunk = 8; + auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); + auto const chunks_per_row = util::div_rounding_up_unsafe(row_bytes, bytes_per_chunk); auto const total_chunks = chunks_per_row * tile.num_rows(); auto &processing_barrier = shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED]; - auto const tail_bytes = row_bytes % 8; + auto const tail_bytes = row_bytes % bytes_per_chunk; auto const row_batch_start = tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; @@ -732,7 +736,7 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, // determine source address of my chunk auto const relative_row = i / chunks_per_row; auto const col_chunk = i % chunks_per_row; - auto const relative_chunk_offset = col_chunk * 8; + auto const relative_chunk_offset = col_chunk * bytes_per_chunk; auto const output_dest = output_data_base + row_offsets(relative_row + tile.start_row, row_batch_start) + relative_chunk_offset; @@ -742,7 +746,9 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, if (tail_bytes > 0 && col_chunk == chunks_per_row - 1) cuda::memcpy_async(output_dest, input_src, tail_bytes, processing_barrier); else - cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier); + cuda::memcpy_async(output_dest, input_src, + cuda::aligned_size_t(bytes_per_chunk), + processing_barrier); } } @@ -936,8 +942,9 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, auto const tile_start_row = tile.start_row; auto const num_tile_cols = tile.num_cols(); auto const num_tile_rows = tile.num_rows(); - auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, 8); - auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, 32); + constexpr auto rows_per_read = 32; + auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, CHAR_BIT); + auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); auto const validity_data_col_length = num_sections_y * 4; // words to bytes auto const total_sections = num_sections_x * num_sections_y; int const warp_id = threadIdx.x / warp_size; @@ -950,8 +957,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // convert section to row and col auto const section_x = my_section_idx % num_sections_x; auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * 8; - auto const relative_row = section_y 
@@ -936,8 +942,9 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
   auto const tile_start_row = tile.start_row;
   auto const num_tile_cols = tile.num_cols();
   auto const num_tile_rows = tile.num_rows();
-  auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, 8);
-  auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, 32);
+  constexpr auto rows_per_read = 32;
+  auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, CHAR_BIT);
+  auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read);
   auto const validity_data_col_length = num_sections_y * 4;  // words to bytes
   auto const total_sections = num_sections_x * num_sections_y;
   int const warp_id = threadIdx.x / warp_size;
@@ -950,8 +957,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
     // convert section to row and col
     auto const section_x = my_section_idx % num_sections_x;
     auto const section_y = my_section_idx / num_sections_x;
-    auto const relative_col = section_x * 8;
-    auto const relative_row = section_y * 32 + lane_id;
+    auto const relative_col = section_x * CHAR_BIT;
+    auto const relative_row = section_y * rows_per_read + lane_id;
     auto const absolute_col = relative_col + tile_start_col;
     auto const absolute_row = relative_row + tile_start_row;
     auto const row_batch_start =
@@ -961,18 +968,18 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
     if (absolute_row < num_rows) {
       auto const my_byte = input_data[row_offsets(absolute_row, row_batch_start) +
-                                      validity_offset + absolute_col / 8];
+                                      validity_offset + absolute_col / CHAR_BIT];
 
       // so every thread that is participating in the warp has a byte, but it's row-based
       // data and we need it in column-based. So we shuffle the bits around to make
       // the bytes we actually write.
-      for (int i = 0, byte_mask = 1; i < 8 && relative_col + i < num_columns;
+      for (int i = 0, byte_mask = 1; i < CHAR_BIT && relative_col + i < num_columns;
           ++i, byte_mask <<= 1) {
         auto validity_data = __ballot_sync(participation_mask, my_byte & byte_mask);
 
         // lead thread in each warp writes data
         if (threadIdx.x % warp_size == 0) {
           auto const validity_write_offset =
-              validity_data_col_length * (relative_col + i) + relative_row / 8;
+              validity_data_col_length * (relative_col + i) + relative_row / CHAR_BIT;
           *reinterpret_cast(&this_shared_tile[validity_write_offset]) = validity_data;
         }
 
@@ -984,19 +991,20 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
   group.sync();
 
   // now async memcpy the shared memory out to the final destination 8 bytes at a time
-  auto const col_bytes = util::div_rounding_up_unsafe(num_tile_rows, 8);
-  auto const chunks_per_col = util::div_rounding_up_unsafe(col_bytes, 8);
+  constexpr auto bytes_per_chunk = 8;
+  auto const col_bytes = util::div_rounding_up_unsafe(num_tile_rows, CHAR_BIT);
+  auto const chunks_per_col = util::div_rounding_up_unsafe(col_bytes, bytes_per_chunk);
   auto const total_chunks = chunks_per_col * num_tile_cols;
   auto &processing_barrier =
       shared_tile_barriers[validity_tile % NUM_VALIDITY_TILES_PER_KERNEL_LOADED];
-  auto const tail_bytes = col_bytes % 8;
+  auto const tail_bytes = col_bytes % bytes_per_chunk;
 
   for (auto i = threadIdx.x; i < total_chunks; i += blockDim.x) {
     // determine source address of my chunk
     auto const relative_col = i / chunks_per_col;
     auto const row_chunk = i % chunks_per_col;
     auto const absolute_col = relative_col + tile_start_col;
-    auto const relative_chunk_byte_offset = row_chunk * 8;
+    auto const relative_chunk_byte_offset = row_chunk * bytes_per_chunk;
     auto const output_dest = output_nm[absolute_col] + word_index(tile_start_row) + row_chunk * 2;
     auto const input_src = &this_shared_tile[validity_data_col_length * relative_col +
                                              relative_chunk_byte_offset];
@@ -1004,7 +1012,9 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns,
     if (tail_bytes > 0 && row_chunk == chunks_per_col - 1) {
       cuda::memcpy_async(output_dest, input_src, tail_bytes, processing_barrier);
     } else {
-      cuda::memcpy_async(output_dest, input_src, cuda::aligned_size_t<8>(8), processing_barrier);
+      cuda::memcpy_async(output_dest, input_src,
+                         cuda::aligned_size_t(bytes_per_chunk),
+                         processing_barrier);
     }
   }
 }
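The chunked cuda::memcpy_async pattern that both kernels now express with bytes_per_chunk can be read in isolation in the sketch below. It is an illustration, not the patch's code: chunked_copy, row_bytes, and the staging loop are hypothetical, and it assumes an sm_70-or-newer device, an 8-byte-aligned destination buffer, and enough dynamic shared memory for row_bytes bytes.

#include <cuda/barrier>

#include <cstddef>
#include <cstdint>

// Hypothetical illustration only: copy row_bytes bytes from shared memory to global
// memory in 8-byte aligned chunks, with a plain-sized copy for the short tail.
__global__ void chunked_copy(uint8_t *out, size_t row_bytes) {
  extern __shared__ uint8_t smem[];

  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
  if (threadIdx.x == 0) { init(&barrier, blockDim.x); }
  __syncthreads();

  // stage some bytes in shared memory (stand-in for the validity words above)
  for (size_t i = threadIdx.x; i < row_bytes; i += blockDim.x) { smem[i] = i & 0xff; }
  __syncthreads();

  constexpr size_t bytes_per_chunk = 8;
  size_t const num_chunks = (row_bytes + bytes_per_chunk - 1) / bytes_per_chunk;
  size_t const tail_bytes = row_bytes % bytes_per_chunk;

  for (size_t chunk = threadIdx.x; chunk < num_chunks; chunk += blockDim.x) {
    size_t const offset = chunk * bytes_per_chunk;
    if (tail_bytes > 0 && chunk == num_chunks - 1) {
      // the last chunk may be short, so its size cannot promise 8-byte alignment
      cuda::memcpy_async(out + offset, smem + offset, tail_bytes, barrier);
    } else {
      cuda::memcpy_async(out + offset, smem + offset,
                         cuda::aligned_size_t<bytes_per_chunk>(bytes_per_chunk), barrier);
    }
  }

  // wait until every copy bound to the barrier has landed
  barrier.arrive_and_wait();
}

Advertising the copies as 8-byte sized and aligned is what permits wider copies under the hood, while the tail falls back to a plain byte count, mirroring the tail_bytes branches in the hunks above.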
@@ -1144,7 +1154,8 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s
   // Now we need to add in space for validity
   // Eventually we can think about nullable vs not nullable, but for now we will just always add
   // it in
-  int32_t const validity_bytes_needed = util::div_rounding_up_safe(schema.size(), 8);
+  int32_t const validity_bytes_needed =
+      util::div_rounding_up_safe(schema.size(), CHAR_BIT);
   // validity comes at the end and is byte aligned so we can pack more in.
   at_offset += validity_bytes_needed;
   // Now we need to pad the end so all rows are 64 bit aligned
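As a back-of-the-envelope check of the layout math this hunk touches, the sketch below re-derives a fixed-width row size on the host. It is a simplified stand-in, not the library's code: fixed_width_row_size is a hypothetical name, and the per-column self-alignment step is an assumption about the earlier part of compute_fixed_width_layout that this hunk does not show.

#include <climits>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical re-derivation: align each column to its own width, append one validity
// bit per column rounded up to whole bytes, then pad the row out to 8 bytes.
int32_t fixed_width_row_size(std::vector<int32_t> const &col_sizes) {
  int32_t at_offset = 0;
  for (auto const size : col_sizes) {
    at_offset = (at_offset + size - 1) / size * size;  // self-align the column
    at_offset += size;
  }
  at_offset += (static_cast<int32_t>(col_sizes.size()) + CHAR_BIT - 1) / CHAR_BIT;
  return (at_offset + 7) / 8 * 8;  // rows stay 64-bit aligned
}

int main() {
  // an example schema of nine columns: 1-, 4-, 2-, 8-, 4-, 1-, 2-, 1- and 8-byte types
  std::printf("%d\n", fixed_width_row_size({1, 4, 2, 8, 4, 1, 2, 1, 8}));  // prints 56
  return 0;
}

The change in the hunk itself only concerns the validity term: ceil(num_columns / CHAR_BIT) bytes appended after the data and before the final 8-byte padding.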
@@ -1189,7 +1200,7 @@ static size_type compute_column_information(iterator begin, iterator end,
 
   return util::round_up_unsafe(
       fixed_width_size_per_row +
-          util::div_rounding_up_safe(static_cast(std::distance(begin, end)), 8),
+          util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT),
       JCUDF_ROW_ALIGNMENT);
 }
 
@@ -1211,9 +1222,9 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row
       [&]() {
         if (desired_rows_and_columns > num_columns) {
           // not many columns, group it into 8s and ship it off
-          return std::min(8, num_columns);
+          return std::min(CHAR_BIT, num_columns);
         } else {
-          return util::round_down_safe(desired_rows_and_columns, 8);
+          return util::round_down_safe(desired_rows_and_columns, CHAR_BIT);
         }
       }(),
       JCUDF_ROW_ALIGNMENT);
@@ -1221,8 +1232,8 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row
   // we fit as much as we can given the column stride
   // note that an element in the table takes just 1 bit, but a row with a single
   // element still takes 8 bytes!
-  auto const bytes_per_row =
-      util::round_up_safe(util::div_rounding_up_unsafe(column_stride, 8), JCUDF_ROW_ALIGNMENT);
+  auto const bytes_per_row = util::round_up_safe(
+      util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT);
   auto const row_stride =
       std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64));
 
@@ -1250,7 +1261,8 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row
 }
 
 /**
- * @brief functor that returns the size of a row or 0 is row is greater than the number of rows in the table
+ * @brief functor that returns the size of a row or 0 if the row is greater than the number of
+ * rows in the table
  *
  * @tparam RowSize iterator that returns the size of a specific row
  */

From 0d0015afcc1476b8e1b55d55cb525511b35df611 Mon Sep 17 00:00:00 2001
From: Mike Wilson
Date: Mon, 10 Jan 2022 04:26:40 +0000
Subject: [PATCH 80/80] removing magic number 2

---
 java/src/main/native/src/row_conversion.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu
index 94b9e4bc143..3ef092792bf 100644
--- a/java/src/main/native/src/row_conversion.cu
+++ b/java/src/main/native/src/row_conversion.cu
@@ -1804,10 +1804,11 @@ std::unique_ptr convert_from_rows(lists_column_view const &input,
 
   // only ever get a single batch when going from rows, so boundaries
   // are 0, num_rows
-  device_uvector gpu_batch_row_boundaries(2, stream);
+  constexpr auto num_batches = 2;
+  device_uvector gpu_batch_row_boundaries(num_batches, stream);
 
   thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0),
-                    thrust::make_counting_iterator(2), gpu_batch_row_boundaries.begin(),
+                    thrust::make_counting_iterator(num_batches), gpu_batch_row_boundaries.begin(),
                     [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; });
 
   int info_count = 0;
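To make the boundary-filling idiom in this last hunk concrete, here is a small self-contained sketch. It is a stand-in, not the patch's code: it uses plain Thrust containers instead of device_uvector and rmm::exec_policy, and the device lambda assumes compilation with nvcc's --extended-lambda flag.

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

#include <cstdio>

int main() {
  int const num_rows = 1000;
  constexpr int num_batches = 2;  // a single batch needs the two boundaries {0, num_rows}

  thrust::device_vector<int> boundaries(num_batches);

  // i == 0 produces the starting boundary 0, every later index produces num_rows
  thrust::transform(thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(num_batches), boundaries.begin(),
                    [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; });

  std::printf("%d %d\n", static_cast<int>(boundaries[0]),
              static_cast<int>(boundaries[1]));  // 0 1000
  return 0;
}

Naming num_batches instead of repeating the literal 2 is the point of this commit: the counting iterator's end and the buffer size can no longer drift apart.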